Commit
·
54c0e82
1
Parent(s):
d9d7415
Test GGUF with lightweight build
Browse files- src/generator/generator_gguf.py +36 -52
- src/utils/config.py +59 -0
src/generator/generator_gguf.py
CHANGED
|
@@ -78,18 +78,25 @@ class GGUFGenerator:
|
|
| 78 |
return
|
| 79 |
|
| 80 |
try:
|
|
|
|
|
|
|
|
|
|
| 81 |
# Model Hub 사용 여부에 따라 경로 결정
|
| 82 |
-
if
|
| 83 |
# === Model Hub์์ ๋ค์ด๋ก๋ ===
|
| 84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
|
| 86 |
from huggingface_hub import hf_hub_download
|
| 87 |
|
| 88 |
model_path = hf_hub_download(
|
| 89 |
-
repo_id=
|
| 90 |
-
filename=
|
| 91 |
-
cache_dir=
|
| 92 |
-
local_dir=
|
| 93 |
local_dir_use_symlinks=False # 심볼릭 링크 대신 실제 복사
|
| 94 |
)
|
| 95 |
|
|
@@ -97,7 +104,7 @@ class GGUFGenerator:
|
|
| 97 |
|
| 98 |
else:
|
| 99 |
# === 로컬 파일 사용 ===
|
| 100 |
-
model_path = self.
|
| 101 |
|
| 102 |
if not os.path.exists(model_path):
|
| 103 |
raise FileNotFoundError(
|
|
@@ -305,33 +312,42 @@ class GGUFRAGPipeline:
|
|
| 305 |
alpha: 임베딩 가중치
|
| 306 |
"""
|
| 307 |
self.config = config or RAGConfig()
|
| 308 |
-
|
|
|
|
|
|
|
| 309 |
|
| 310 |
# 검색 설정
|
| 311 |
-
self.search_mode = search_mode or self.config
|
| 312 |
-
self.alpha = alpha if alpha is not None else self.config.
|
| 313 |
|
| 314 |
# Retriever 초기화 (RAGRetriever 사용)
|
| 315 |
logger.info("RAGRetriever 초기화 중...")
|
| 316 |
from src.retriever.retriever import RAGRetriever
|
| 317 |
self.retriever = RAGRetriever(config=self.config)
|
| 318 |
|
| 319 |
-
# GGUF 설정 (파라미터가 주어지면 config
|
| 320 |
-
gguf_n_gpu_layers = n_gpu_layers if n_gpu_layers is not None else self.config
|
| 321 |
-
gguf_n_ctx = n_ctx if n_ctx is not None else self.config
|
| 322 |
-
gguf_n_threads = n_threads if n_threads is not None else self.config
|
| 323 |
-
gguf_max_new_tokens = max_new_tokens if max_new_tokens is not None else self.config
|
| 324 |
-
gguf_temperature = temperature if temperature is not None else self.config.
|
| 325 |
-
gguf_top_p = top_p if top_p is not None else self.config.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 326 |
|
| 327 |
# GGUFGenerator 초기화
|
| 328 |
logger.info("GGUFGenerator 초기화 중...")
|
| 329 |
logger.info(f" GPU 레이어: {gguf_n_gpu_layers}")
|
| 330 |
logger.info(f" 컨텍스트: {gguf_n_ctx}")
|
| 331 |
logger.info(f" 스레드: {gguf_n_threads}")
|
|
|
|
| 332 |
|
| 333 |
self.generator = GGUFGenerator(
|
| 334 |
-
model_path=
|
| 335 |
n_gpu_layers=gguf_n_gpu_layers,
|
| 336 |
n_ctx=gguf_n_ctx,
|
| 337 |
n_threads=gguf_n_threads,
|
|
@@ -339,7 +355,7 @@ class GGUFRAGPipeline:
|
|
| 339 |
max_new_tokens=gguf_max_new_tokens,
|
| 340 |
temperature=gguf_temperature,
|
| 341 |
top_p=gguf_top_p,
|
| 342 |
-
system_prompt=
|
| 343 |
)
|
| 344 |
|
| 345 |
# 모델 로드 (시간 소요)
|
|
@@ -552,36 +568,4 @@ class GGUFRAGPipeline:
|
|
| 552 |
logger.info(
|
| 553 |
f"๐ง ๊ฒ์ ์ค์ ๋ณ๊ฒฝ: mode={self.search_mode}, "
|
| 554 |
f"top_k={self.top_k}, alpha={self.alpha}"
|
| 555 |
-
)
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
# 테스트용
|
| 559 |
-
if __name__ == "__main__":
|
| 560 |
-
from src.utils.config import RAGConfig
|
| 561 |
-
|
| 562 |
-
config = RAGConfig()
|
| 563 |
-
|
| 564 |
-
# GGUFRAGPipeline 초기화
|
| 565 |
-
pipeline = GGUFRAGPipeline(config=config)
|
| 566 |
-
|
| 567 |
-
# 테스트 질문들
|
| 568 |
-
test_questions = [
|
| 569 |
-
"์๋
ํ์ธ์",
|
| 570 |
-
"๋ณธ ์ฌ์
์ ์์ฐ ๋ฒ์๋ ์ด๋ป๊ฒ ๋๋์?",
|
| 571 |
-
"๊ณ ๋ง์์!"
|
| 572 |
-
]
|
| 573 |
-
|
| 574 |
-
for question in test_questions:
|
| 575 |
-
print("\n" + "="*50)
|
| 576 |
-
print("테스트 질문:", question)
|
| 577 |
-
print("="*50)
|
| 578 |
-
|
| 579 |
-
result = pipeline.generate_answer(question)
|
| 580 |
-
|
| 581 |
-
print(f"\n라우팅: {result['routing_info']['route']}")
|
| 582 |
-
print(f"검색 사용: {result['used_retrieval']}")
|
| 583 |
-
print("\n응답:")
|
| 584 |
-
print(result['answer'])
|
| 585 |
-
print(f"\n소요 시간: {result['elapsed_time']:.2f}초")
|
| 586 |
-
print(f"참고 문서: {len(result['sources'])}개")
|
| 587 |
-
print("="*50)
|
|
|
|
| 78 |
return
|
| 79 |
|
| 80 |
try:
|
| 81 |
+
# Config์์ USE_MODEL_HUB ํ์ธ (์์ผ๋ฉด True ๊ธฐ๋ณธ๊ฐ)
|
| 82 |
+
use_model_hub = getattr(self.config, 'USE_MODEL_HUB', True)
|
| 83 |
+
|
| 84 |
# Model Hub 사용 여부에 따라 경로 결정
|
| 85 |
+
if use_model_hub:
|
| 86 |
# === Model Hub์์ ๋ค์ด๋ก๋ ===
|
| 87 |
+
model_hub_repo = getattr(self.config, 'MODEL_HUB_REPO', 'beomi/Llama-3-Open-Ko-8B-gguf')
|
| 88 |
+
model_hub_filename = getattr(self.config, 'MODEL_HUB_FILENAME', 'ggml-model-Q4_K_M.gguf')
|
| 89 |
+
model_cache_dir = getattr(self.config, 'MODEL_CACHE_DIR', '.cache/models')
|
| 90 |
+
|
| 91 |
+
logger.info(f"๐ฅ Model Hub์์ ๋ค์ด๋ก๋: {model_hub_repo}")
|
| 92 |
|
| 93 |
from huggingface_hub import hf_hub_download
|
| 94 |
|
| 95 |
model_path = hf_hub_download(
|
| 96 |
+
repo_id=model_hub_repo,
|
| 97 |
+
filename=model_hub_filename,
|
| 98 |
+
cache_dir=model_cache_dir,
|
| 99 |
+
local_dir=model_cache_dir,
|
| 100 |
local_dir_use_symlinks=False # 심볼릭 링크 대신 실제 복사
|
| 101 |
)
|
| 102 |
|
|
|
|
| 104 |
|
| 105 |
else:
|
| 106 |
# === 로컬 파일 사용 ===
|
| 107 |
+
model_path = self.model_path # 생성자에서 받은 경로 사용
|
| 108 |
|
| 109 |
if not os.path.exists(model_path):
|
| 110 |
raise FileNotFoundError(
|
|
|
|
| 312 |
alpha: 임베딩 가중치
|
| 313 |
"""
|
| 314 |
self.config = config or RAGConfig()
|
| 315 |
+
|
| 316 |
+
# Config์์ ๊ธฐ๋ณธ๊ฐ ๊ฐ์ ธ์ค๊ธฐ (์์ผ๋ฉด fallback)
|
| 317 |
+
self.top_k = top_k or getattr(self.config, 'DEFAULT_TOP_K', 10)
|
| 318 |
|
| 319 |
# 검색 설정
|
| 320 |
+
self.search_mode = search_mode or getattr(self.config, 'DEFAULT_SEARCH_MODE', 'hybrid_rerank')
|
| 321 |
+
self.alpha = alpha if alpha is not None else getattr(self.config, 'DEFAULT_ALPHA', 0.5)
|
| 322 |
|
| 323 |
# Retriever 초기화 (RAGRetriever 사용)
|
| 324 |
logger.info("RAGRetriever 초기화 중...")
|
| 325 |
from src.retriever.retriever import RAGRetriever
|
| 326 |
self.retriever = RAGRetriever(config=self.config)
|
| 327 |
|
| 328 |
+
# GGUF 설정 (파라미터가 주어지면 config 오버라이드, 없으면 기본값)
|
| 329 |
+
gguf_n_gpu_layers = n_gpu_layers if n_gpu_layers is not None else getattr(self.config, 'GGUF_N_GPU_LAYERS', 35)
|
| 330 |
+
gguf_n_ctx = n_ctx if n_ctx is not None else getattr(self.config, 'GGUF_N_CTX', 2048)
|
| 331 |
+
gguf_n_threads = n_threads if n_threads is not None else getattr(self.config, 'GGUF_N_THREADS', 4)
|
| 332 |
+
gguf_max_new_tokens = max_new_tokens if max_new_tokens is not None else getattr(self.config, 'GGUF_MAX_NEW_TOKENS', 512)
|
| 333 |
+
gguf_temperature = temperature if temperature is not None else getattr(self.config, 'GGUF_TEMPERATURE', 0.7)
|
| 334 |
+
gguf_top_p = top_p if top_p is not None else getattr(self.config, 'GGUF_TOP_P', 0.9)
|
| 335 |
+
|
| 336 |
+
# 모델 경로 (fallback)
|
| 337 |
+
gguf_model_path = getattr(self.config, 'GGUF_MODEL_PATH', '.cache/models/llama-3-ko-8b.gguf')
|
| 338 |
+
|
| 339 |
+
# 시스템 프롬프트 (fallback)
|
| 340 |
+
system_prompt = getattr(self.config, 'SYSTEM_PROMPT', '๋น์ ์ ํ๊ตญ ๊ณต๊ณต๊ธฐ๊ด ์ฌ์
์ ์์ ๋ถ์ ์ ๋ฌธ๊ฐ์
๋๋ค.')
|
| 341 |
|
| 342 |
# GGUFGenerator 초기화
|
| 343 |
logger.info("GGUFGenerator 초기화 중...")
|
| 344 |
logger.info(f" GPU 레이어: {gguf_n_gpu_layers}")
|
| 345 |
logger.info(f" 컨텍스트: {gguf_n_ctx}")
|
| 346 |
logger.info(f" 스레드: {gguf_n_threads}")
|
| 347 |
+
logger.info(f" 모델 경로: {gguf_model_path}")
|
| 348 |
|
| 349 |
self.generator = GGUFGenerator(
|
| 350 |
+
model_path=gguf_model_path,
|
| 351 |
n_gpu_layers=gguf_n_gpu_layers,
|
| 352 |
n_ctx=gguf_n_ctx,
|
| 353 |
n_threads=gguf_n_threads,
|
|
|
|
| 355 |
max_new_tokens=gguf_max_new_tokens,
|
| 356 |
temperature=gguf_temperature,
|
| 357 |
top_p=gguf_top_p,
|
| 358 |
+
system_prompt=system_prompt
|
| 359 |
)
|
| 360 |
|
| 361 |
# 모델 로드 (시간 소요)
|
|
|
|
| 568 |
logger.info(
|
| 569 |
f"๐ง ๊ฒ์ ์ค์ ๋ณ๊ฒฝ: mode={self.search_mode}, "
|
| 570 |
f"top_k={self.top_k}, alpha={self.alpha}"
|
| 571 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/utils/config.py
CHANGED
|
@@ -52,6 +52,26 @@ class Config:
|
|
| 52 |
|
| 53 |
# 시스템 프롬프트
|
| 54 |
self.SYSTEM_PROMPT = "당신은 RFP(제안요청서) 분석 및 요약 전문가입니다."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
def _get_api_key(self) -> str:
|
| 57 |
"""환경변수에서 API 키 로드"""
|
|
@@ -89,16 +109,55 @@ class Config:
|
|
| 89 |
raise ValueError("OPENAI_API_KEY๊ฐ ์ค์ ๋์ง ์์์ต๋๋ค")
|
| 90 |
|
| 91 |
return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
def validate_all(self):
|
| 94 |
"""전체 설정 유효성 검사"""
|
| 95 |
self.validate_preprocess()
|
| 96 |
self.validate_rag()
|
|
|
|
| 97 |
return True
|
| 98 |
|
| 99 |
def validate(self):
|
| 100 |
"""설정 유효성 검사 (하위 호환성)"""
|
| 101 |
return self.validate_preprocess()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
|
| 104 |
# 하위 호환성을 위한 별칭
|
|
|
|
| 52 |
|
| 53 |
# 시스템 프롬프트
|
| 54 |
self.SYSTEM_PROMPT = "당신은 RFP(제안요청서) 분석 및 요약 전문가입니다."
|
| 55 |
+
|
| 56 |
+
# ===== GGUF 로컬 모델 설정 =====
|
| 57 |
+
# Model Hub 사용 여부 (환경변수 우선)
|
| 58 |
+
self.USE_MODEL_HUB = os.getenv("USE_MODEL_HUB", "true").lower() == "true"
|
| 59 |
+
|
| 60 |
+
# Hugging Face Model Hub 설정
|
| 61 |
+
self.MODEL_HUB_REPO = os.getenv("MODEL_HUB_REPO", "beomi/Llama-3-Open-Ko-8B-gguf")
|
| 62 |
+
self.MODEL_HUB_FILENAME = os.getenv("MODEL_HUB_FILENAME", "ggml-model-Q4_K_M.gguf")
|
| 63 |
+
self.MODEL_CACHE_DIR = os.getenv("MODEL_CACHE_DIR", ".cache/models")
|
| 64 |
+
|
| 65 |
+
# ๋ก์ปฌ ๊ฒฝ๋ก (USE_MODEL_HUB=false์ธ ๊ฒฝ์ฐ)
|
| 66 |
+
self.GGUF_MODEL_PATH = os.getenv("GGUF_MODEL_PATH", ".cache/models/llama-3-ko-8b-Q4_K_M.gguf")
|
| 67 |
+
|
| 68 |
+
# GGUF GPU 설정 (T4 Medium 최적화)
|
| 69 |
+
self.GGUF_N_GPU_LAYERS = int(os.getenv("GGUF_N_GPU_LAYERS", "35")) # T4์์ 8B ๋ชจ๋ธ ์ ์ฒด๋ฅผ GPU์ ๋ก๋
|
| 70 |
+
self.GGUF_N_CTX = int(os.getenv("GGUF_N_CTX", "2048")) # 컨텍스트 길이
|
| 71 |
+
self.GGUF_N_THREADS = int(os.getenv("GGUF_N_THREADS", "4")) # CPU 스레드 (GPU 사용 시 낮게)
|
| 72 |
+
self.GGUF_MAX_NEW_TOKENS = int(os.getenv("GGUF_MAX_NEW_TOKENS", "512")) # 최대 생성 토큰
|
| 73 |
+
self.GGUF_TEMPERATURE = float(os.getenv("GGUF_TEMPERATURE", "0.7")) # 생성 다양성
|
| 74 |
+
self.GGUF_TOP_P = float(os.getenv("GGUF_TOP_P", "0.9")) # Nucleus sampling
|
| 75 |
|
| 76 |
def _get_api_key(self) -> str:
|
| 77 |
"""환경변수에서 API 키 로드"""
|
|
|
|
| 109 |
raise ValueError("OPENAI_API_KEY๊ฐ ์ค์ ๋์ง ์์์ต๋๋ค")
|
| 110 |
|
| 111 |
return True
|
| 112 |
+
|
| 113 |
+
def validate_gguf(self):
|
| 114 |
+
"""GGUF 설정 유효성 검사"""
|
| 115 |
+
if not self.USE_MODEL_HUB:
|
| 116 |
+
# 로컬 파일 사용 시 경로 확인
|
| 117 |
+
if not os.path.exists(self.GGUF_MODEL_PATH):
|
| 118 |
+
print(f"⚠️ 경고: GGUF 모델 파일이 없습니다: {self.GGUF_MODEL_PATH}")
|
| 119 |
+
print(f" USE_MODEL_HUB=true๋ก ์ค์ ํ์ฌ ์๋ ๋ค์ด๋ก๋ํ๊ฑฐ๋ ๋ชจ๋ธ ํ์ผ์ ์ค๋นํ์ธ์.")
|
| 120 |
+
|
| 121 |
+
# GPU 레이어 설정 확인
|
| 122 |
+
if self.GGUF_N_GPU_LAYERS > 0:
|
| 123 |
+
print(f"✅ GPU 가속 활성화: {self.GGUF_N_GPU_LAYERS}개 레이어")
|
| 124 |
+
else:
|
| 125 |
+
print(f"⚠️ CPU 전용 모드 (n_gpu_layers=0)")
|
| 126 |
+
|
| 127 |
+
return True
|
| 128 |
|
| 129 |
def validate_all(self):
|
| 130 |
"""전체 설정 유효성 검사"""
|
| 131 |
self.validate_preprocess()
|
| 132 |
self.validate_rag()
|
| 133 |
+
self.validate_gguf()
|
| 134 |
return True
|
| 135 |
|
| 136 |
def validate(self):
|
| 137 |
"""설정 유효성 검사 (하위 호환성)"""
|
| 138 |
return self.validate_preprocess()
|
| 139 |
+
|
| 140 |
+
def print_gguf_config(self):
|
| 141 |
+
"""GGUF 설정 출력 (디버깅용)"""
|
| 142 |
+
print("\n" + "="*50)
|
| 143 |
+
print("GGUF 모델 설정")
|
| 144 |
+
print("="*50)
|
| 145 |
+
print(f"Model Hub 사용: {self.USE_MODEL_HUB}")
|
| 146 |
+
if self.USE_MODEL_HUB:
|
| 147 |
+
print(f"Hub Repo: {self.MODEL_HUB_REPO}")
|
| 148 |
+
print(f"Hub 파일명: {self.MODEL_HUB_FILENAME}")
|
| 149 |
+
print(f"캐시 디렉토리: {self.MODEL_CACHE_DIR}")
|
| 150 |
+
else:
|
| 151 |
+
print(f"로컬 경로: {self.GGUF_MODEL_PATH}")
|
| 152 |
+
print(f"\nGPU 설정:")
|
| 153 |
+
print(f" - GPU 레이어: {self.GGUF_N_GPU_LAYERS}")
|
| 154 |
+
print(f" - 컨텍스트: {self.GGUF_N_CTX}")
|
| 155 |
+
print(f" - 스레드: {self.GGUF_N_THREADS}")
|
| 156 |
+
print(f"\n생성 설정:")
|
| 157 |
+
print(f" - Max Tokens: {self.GGUF_MAX_NEW_TOKENS}")
|
| 158 |
+
print(f" - Temperature: {self.GGUF_TEMPERATURE}")
|
| 159 |
+
print(f" - Top-P: {self.GGUF_TOP_P}")
|
| 160 |
+
print("="*50 + "\n")
|
| 161 |
|
| 162 |
|
| 163 |
# 하위 호환성을 위한 별칭
|