Dongjin1203 committed on
Commit
54c0e82
·
1 Parent(s): d9d7415

Test GGUF with lightweight build

Browse files
src/generator/generator_gguf.py CHANGED
@@ -78,18 +78,25 @@ class GGUFGenerator:
78
  return
79
 
80
  try:
 
 
 
81
  # Model Hub ์‚ฌ์šฉ ์—ฌ๋ถ€์— ๋”ฐ๋ผ ๊ฒฝ๋กœ ๊ฒฐ์ •
82
- if self.config.USE_MODEL_HUB:
83
  # === Model Hub์—์„œ ๋‹ค์šด๋กœ๋“œ ===
84
- logger.info(f"๐Ÿ“ฅ Model Hub์—์„œ ๋‹ค์šด๋กœ๋“œ: {self.config.MODEL_HUB_REPO}")
 
 
 
 
85
 
86
  from huggingface_hub import hf_hub_download
87
 
88
  model_path = hf_hub_download(
89
- repo_id=self.config.MODEL_HUB_REPO,
90
- filename=self.config.MODEL_HUB_FILENAME,
91
- cache_dir=self.config.MODEL_CACHE_DIR,
92
- local_dir=self.config.MODEL_CACHE_DIR,
93
  local_dir_use_symlinks=False # ์‹ฌ๋ณผ๋ฆญ ๋งํฌ ๋Œ€์‹  ์‹ค์ œ ๋ณต์‚ฌ
94
  )
95
 
@@ -97,7 +104,7 @@ class GGUFGenerator:
97
 
98
  else:
99
  # === ๋กœ์ปฌ ํŒŒ์ผ ์‚ฌ์šฉ ===
100
- model_path = self.config.GGUF_MODEL_PATH
101
 
102
  if not os.path.exists(model_path):
103
  raise FileNotFoundError(
@@ -305,33 +312,42 @@ class GGUFRAGPipeline:
305
  alpha: ์ž„๋ฒ ๋”ฉ ๊ฐ€์ค‘์น˜
306
  """
307
  self.config = config or RAGConfig()
308
- self.top_k = top_k or self.config.DEFAULT_TOP_K
 
 
309
 
310
  # ๊ฒ€์ƒ‰ ์„ค์ •
311
- self.search_mode = search_mode or self.config.DEFAULT_SEARCH_MODE
312
- self.alpha = alpha if alpha is not None else self.config.DEFAULT_ALPHA
313
 
314
  # Retriever ์ดˆ๊ธฐํ™” (RAGRetriever ์‚ฌ์šฉ)
315
  logger.info("RAGRetriever ์ดˆ๊ธฐํ™” ์ค‘...")
316
  from src.retriever.retriever import RAGRetriever
317
  self.retriever = RAGRetriever(config=self.config)
318
 
319
- # GGUF ์„ค์ • (ํŒŒ๋ผ๋ฏธํ„ฐ๊ฐ€ ์ฃผ์–ด์ง€๋ฉด config ์˜ค๋ฒ„๋ผ์ด๋“œ)
320
- gguf_n_gpu_layers = n_gpu_layers if n_gpu_layers is not None else self.config.GGUF_N_GPU_LAYERS
321
- gguf_n_ctx = n_ctx if n_ctx is not None else self.config.GGUF_N_CTX
322
- gguf_n_threads = n_threads if n_threads is not None else self.config.GGUF_N_THREADS
323
- gguf_max_new_tokens = max_new_tokens if max_new_tokens is not None else self.config.GGUF_MAX_NEW_TOKENS
324
- gguf_temperature = temperature if temperature is not None else self.config.GGUF_TEMPERATURE
325
- gguf_top_p = top_p if top_p is not None else self.config.GGUF_TOP_P
 
 
 
 
 
 
326
 
327
  # GGUFGenerator ์ดˆ๊ธฐํ™”
328
  logger.info("GGUFGenerator ์ดˆ๊ธฐํ™” ์ค‘...")
329
  logger.info(f" GPU ๋ ˆ์ด์–ด: {gguf_n_gpu_layers}")
330
  logger.info(f" ์ปจํ…์ŠคํŠธ: {gguf_n_ctx}")
331
  logger.info(f" ์Šค๋ ˆ๋“œ: {gguf_n_threads}")
 
332
 
333
  self.generator = GGUFGenerator(
334
- model_path=self.config.GGUF_MODEL_PATH,
335
  n_gpu_layers=gguf_n_gpu_layers,
336
  n_ctx=gguf_n_ctx,
337
  n_threads=gguf_n_threads,
@@ -339,7 +355,7 @@ class GGUFRAGPipeline:
339
  max_new_tokens=gguf_max_new_tokens,
340
  temperature=gguf_temperature,
341
  top_p=gguf_top_p,
342
- system_prompt=self.config.SYSTEM_PROMPT
343
  )
344
 
345
  # ๋ชจ๋ธ ๋กœ๋“œ (์‹œ๊ฐ„ ์†Œ์š”)
@@ -552,36 +568,4 @@ class GGUFRAGPipeline:
552
  logger.info(
553
  f"๐Ÿ”ง ๊ฒ€์ƒ‰ ์„ค์ • ๋ณ€๊ฒฝ: mode={self.search_mode}, "
554
  f"top_k={self.top_k}, alpha={self.alpha}"
555
- )
556
-
557
-
558
- # ํ…Œ์ŠคํŠธ์šฉ
559
- if __name__ == "__main__":
560
- from src.utils.config import RAGConfig
561
-
562
- config = RAGConfig()
563
-
564
- # GGUFRAGPipeline ์ดˆ๊ธฐํ™”
565
- pipeline = GGUFRAGPipeline(config=config)
566
-
567
- # ํ…Œ์ŠคํŠธ ์งˆ๋ฌธ๋“ค
568
- test_questions = [
569
- "์•ˆ๋…•ํ•˜์„ธ์š”",
570
- "๋ณธ ์‚ฌ์—…์˜ ์˜ˆ์‚ฐ ๋ฒ”์œ„๋Š” ์–ด๋–ป๊ฒŒ ๋˜๋‚˜์š”?",
571
- "๊ณ ๋งˆ์›Œ์š”!"
572
- ]
573
-
574
- for question in test_questions:
575
- print("\n" + "="*50)
576
- print("ํ…Œ์ŠคํŠธ ์งˆ๋ฌธ:", question)
577
- print("="*50)
578
-
579
- result = pipeline.generate_answer(question)
580
-
581
- print(f"\n๋ผ์šฐํŒ…: {result['routing_info']['route']}")
582
- print(f"๊ฒ€์ƒ‰ ์‚ฌ์šฉ: {result['used_retrieval']}")
583
- print("\n์‘๋‹ต:")
584
- print(result['answer'])
585
- print(f"\n์†Œ์š” ์‹œ๊ฐ„: {result['elapsed_time']:.2f}์ดˆ")
586
- print(f"์ฐธ๊ณ  ๋ฌธ์„œ: {len(result['sources'])}๊ฐœ")
587
- print("="*50)
 
78
  return
79
 
80
  try:
81
+ # Config์—์„œ USE_MODEL_HUB ํ™•์ธ (์—†์œผ๋ฉด True ๊ธฐ๋ณธ๊ฐ’)
82
+ use_model_hub = getattr(self.config, 'USE_MODEL_HUB', True)
83
+
84
  # Model Hub ์‚ฌ์šฉ ์—ฌ๋ถ€์— ๋”ฐ๋ผ ๊ฒฝ๋กœ ๊ฒฐ์ •
85
+ if use_model_hub:
86
  # === Model Hub์—์„œ ๋‹ค์šด๋กœ๋“œ ===
87
+ model_hub_repo = getattr(self.config, 'MODEL_HUB_REPO', 'beomi/Llama-3-Open-Ko-8B-gguf')
88
+ model_hub_filename = getattr(self.config, 'MODEL_HUB_FILENAME', 'ggml-model-Q4_K_M.gguf')
89
+ model_cache_dir = getattr(self.config, 'MODEL_CACHE_DIR', '.cache/models')
90
+
91
+ logger.info(f"๐Ÿ“ฅ Model Hub์—์„œ ๋‹ค์šด๋กœ๋“œ: {model_hub_repo}")
92
 
93
  from huggingface_hub import hf_hub_download
94
 
95
  model_path = hf_hub_download(
96
+ repo_id=model_hub_repo,
97
+ filename=model_hub_filename,
98
+ cache_dir=model_cache_dir,
99
+ local_dir=model_cache_dir,
100
  local_dir_use_symlinks=False # ์‹ฌ๋ณผ๋ฆญ ๋งํฌ ๋Œ€์‹  ์‹ค์ œ ๋ณต์‚ฌ
101
  )
102
 
 
104
 
105
  else:
106
  # === ๋กœ์ปฌ ํŒŒ์ผ ์‚ฌ์šฉ ===
107
+ model_path = self.model_path # ์ƒ์„ฑ์ž์—์„œ ๋ฐ›์€ ๊ฒฝ๋กœ ์‚ฌ์šฉ
108
 
109
  if not os.path.exists(model_path):
110
  raise FileNotFoundError(
 
312
  alpha: ์ž„๋ฒ ๋”ฉ ๊ฐ€์ค‘์น˜
313
  """
314
  self.config = config or RAGConfig()
315
+
316
+ # Config์—์„œ ๊ธฐ๋ณธ๊ฐ’ ๊ฐ€์ ธ์˜ค๊ธฐ (์—†์œผ๋ฉด fallback)
317
+ self.top_k = top_k or getattr(self.config, 'DEFAULT_TOP_K', 10)
318
 
319
  # ๊ฒ€์ƒ‰ ์„ค์ •
320
+ self.search_mode = search_mode or getattr(self.config, 'DEFAULT_SEARCH_MODE', 'hybrid_rerank')
321
+ self.alpha = alpha if alpha is not None else getattr(self.config, 'DEFAULT_ALPHA', 0.5)
322
 
323
  # Retriever ์ดˆ๊ธฐํ™” (RAGRetriever ์‚ฌ์šฉ)
324
  logger.info("RAGRetriever ์ดˆ๊ธฐํ™” ์ค‘...")
325
  from src.retriever.retriever import RAGRetriever
326
  self.retriever = RAGRetriever(config=self.config)
327
 
328
+ # GGUF ์„ค์ • (ํŒŒ๋ผ๋ฏธํ„ฐ๊ฐ€ ์ฃผ์–ด์ง€๋ฉด config ์˜ค๋ฒ„๋ผ์ด๋“œ, ์—†์œผ๋ฉด ๊ธฐ๋ณธ๊ฐ’)
329
+ gguf_n_gpu_layers = n_gpu_layers if n_gpu_layers is not None else getattr(self.config, 'GGUF_N_GPU_LAYERS', 35)
330
+ gguf_n_ctx = n_ctx if n_ctx is not None else getattr(self.config, 'GGUF_N_CTX', 2048)
331
+ gguf_n_threads = n_threads if n_threads is not None else getattr(self.config, 'GGUF_N_THREADS', 4)
332
+ gguf_max_new_tokens = max_new_tokens if max_new_tokens is not None else getattr(self.config, 'GGUF_MAX_NEW_TOKENS', 512)
333
+ gguf_temperature = temperature if temperature is not None else getattr(self.config, 'GGUF_TEMPERATURE', 0.7)
334
+ gguf_top_p = top_p if top_p is not None else getattr(self.config, 'GGUF_TOP_P', 0.9)
335
+
336
+ # ๋ชจ๋ธ ๊ฒฝ๋กœ (fallback)
337
+ gguf_model_path = getattr(self.config, 'GGUF_MODEL_PATH', '.cache/models/llama-3-ko-8b.gguf')
338
+
339
+ # ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ (fallback)
340
+ system_prompt = getattr(self.config, 'SYSTEM_PROMPT', '๋‹น์‹ ์€ ํ•œ๊ตญ ๊ณต๊ณต๊ธฐ๊ด€ ์‚ฌ์—…์ œ์•ˆ์„œ ๋ถ„์„ ์ „๋ฌธ๊ฐ€์ž…๋‹ˆ๋‹ค.')
341
 
342
  # GGUFGenerator ์ดˆ๊ธฐํ™”
343
  logger.info("GGUFGenerator ์ดˆ๊ธฐํ™” ์ค‘...")
344
  logger.info(f" GPU ๋ ˆ์ด์–ด: {gguf_n_gpu_layers}")
345
  logger.info(f" ์ปจํ…์ŠคํŠธ: {gguf_n_ctx}")
346
  logger.info(f" ์Šค๋ ˆ๋“œ: {gguf_n_threads}")
347
+ logger.info(f" ๋ชจ๋ธ ๊ฒฝ๋กœ: {gguf_model_path}")
348
 
349
  self.generator = GGUFGenerator(
350
+ model_path=gguf_model_path,
351
  n_gpu_layers=gguf_n_gpu_layers,
352
  n_ctx=gguf_n_ctx,
353
  n_threads=gguf_n_threads,
 
355
  max_new_tokens=gguf_max_new_tokens,
356
  temperature=gguf_temperature,
357
  top_p=gguf_top_p,
358
+ system_prompt=system_prompt
359
  )
360
 
361
  # ๋ชจ๋ธ ๋กœ๋“œ (์‹œ๊ฐ„ ์†Œ์š”)
 
568
  logger.info(
569
  f"๐Ÿ”ง ๊ฒ€์ƒ‰ ์„ค์ • ๋ณ€๊ฒฝ: mode={self.search_mode}, "
570
  f"top_k={self.top_k}, alpha={self.alpha}"
571
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/utils/config.py CHANGED
@@ -52,6 +52,26 @@ class Config:
52
 
53
  # ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ
54
  self.SYSTEM_PROMPT = "๋‹น์‹ ์€ RFP(์ œ์•ˆ์š”์ฒญ์„œ) ๋ถ„์„ ๋ฐ ์š”์•ฝ ์ „๋ฌธ๊ฐ€์ž…๋‹ˆ๋‹ค."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
  def _get_api_key(self) -> str:
57
  """ํ™˜๊ฒฝ๋ณ€์ˆ˜์—์„œ API ํ‚ค ๋กœ๋“œ"""
@@ -89,16 +109,55 @@ class Config:
89
  raise ValueError("OPENAI_API_KEY๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค")
90
 
91
  return True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  def validate_all(self):
94
  """์ „์ฒด ์„ค์ • ์œ ํšจ์„ฑ ๊ฒ€์‚ฌ"""
95
  self.validate_preprocess()
96
  self.validate_rag()
 
97
  return True
98
 
99
  def validate(self):
100
  """์„ค์ • ์œ ํšจ์„ฑ ๊ฒ€์‚ฌ (ํ•˜์œ„ ํ˜ธํ™˜์„ฑ)"""
101
  return self.validate_preprocess()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
 
104
  # ํ•˜์œ„ ํ˜ธํ™˜์„ฑ์„ ์œ„ํ•œ ๋ณ„์นญ
 
52
 
53
  # ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ
54
  self.SYSTEM_PROMPT = "๋‹น์‹ ์€ RFP(์ œ์•ˆ์š”์ฒญ์„œ) ๋ถ„์„ ๋ฐ ์š”์•ฝ ์ „๋ฌธ๊ฐ€์ž…๋‹ˆ๋‹ค."
55
+
56
+ # ===== GGUF ๋กœ์ปฌ ๋ชจ๋ธ ์„ค์ • =====
57
+ # Model Hub ์‚ฌ์šฉ ์—ฌ๋ถ€ (ํ™˜๊ฒฝ๋ณ€์ˆ˜ ์šฐ์„ )
58
+ self.USE_MODEL_HUB = os.getenv("USE_MODEL_HUB", "true").lower() == "true"
59
+
60
+ # Hugging Face Model Hub ์„ค์ •
61
+ self.MODEL_HUB_REPO = os.getenv("MODEL_HUB_REPO", "beomi/Llama-3-Open-Ko-8B-gguf")
62
+ self.MODEL_HUB_FILENAME = os.getenv("MODEL_HUB_FILENAME", "ggml-model-Q4_K_M.gguf")
63
+ self.MODEL_CACHE_DIR = os.getenv("MODEL_CACHE_DIR", ".cache/models")
64
+
65
+ # ๋กœ์ปฌ ๊ฒฝ๋กœ (USE_MODEL_HUB=false์ธ ๊ฒฝ์šฐ)
66
+ self.GGUF_MODEL_PATH = os.getenv("GGUF_MODEL_PATH", ".cache/models/llama-3-ko-8b-Q4_K_M.gguf")
67
+
68
+ # GGUF GPU ์„ค์ • (T4 Medium ์ตœ์ ํ™”)
69
+ self.GGUF_N_GPU_LAYERS = int(os.getenv("GGUF_N_GPU_LAYERS", "35")) # T4์—์„œ 8B ๋ชจ๋ธ ์ „์ฒด๋ฅผ GPU์— ๋กœ๋“œ
70
+ self.GGUF_N_CTX = int(os.getenv("GGUF_N_CTX", "2048")) # ์ปจํ…์ŠคํŠธ ๊ธธ์ด
71
+ self.GGUF_N_THREADS = int(os.getenv("GGUF_N_THREADS", "4")) # CPU ์Šค๋ ˆ๋“œ (GPU ์‚ฌ์šฉ ์‹œ ๋‚ฎ๊ฒŒ)
72
+ self.GGUF_MAX_NEW_TOKENS = int(os.getenv("GGUF_MAX_NEW_TOKENS", "512")) # ์ตœ๋Œ€ ์ƒ์„ฑ ํ† ํฐ
73
+ self.GGUF_TEMPERATURE = float(os.getenv("GGUF_TEMPERATURE", "0.7")) # ์ƒ์„ฑ ๋‹ค์–‘์„ฑ
74
+ self.GGUF_TOP_P = float(os.getenv("GGUF_TOP_P", "0.9")) # Nucleus sampling
75
 
76
  def _get_api_key(self) -> str:
77
  """ํ™˜๊ฒฝ๋ณ€์ˆ˜์—์„œ API ํ‚ค ๋กœ๋“œ"""
 
109
  raise ValueError("OPENAI_API_KEY๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค")
110
 
111
  return True
112
+
113
+ def validate_gguf(self):
114
+ """GGUF ์„ค์ • ์œ ํšจ์„ฑ ๊ฒ€์‚ฌ"""
115
+ if not self.USE_MODEL_HUB:
116
+ # ๋กœ์ปฌ ํŒŒ์ผ ์‚ฌ์šฉ ์‹œ ๊ฒฝ๋กœ ํ™•์ธ
117
+ if not os.path.exists(self.GGUF_MODEL_PATH):
118
+ print(f"โš ๏ธ ๊ฒฝ๊ณ : GGUF ๋ชจ๋ธ ํŒŒ์ผ์ด ์—†์Šต๋‹ˆ๋‹ค: {self.GGUF_MODEL_PATH}")
119
+ print(f" USE_MODEL_HUB=true๋กœ ์„ค์ •ํ•˜์—ฌ ์ž๋™ ๋‹ค์šด๋กœ๋“œํ•˜๊ฑฐ๋‚˜ ๋ชจ๋ธ ํŒŒ์ผ์„ ์ค€๋น„ํ•˜์„ธ์š”.")
120
+
121
+ # GPU ๋ ˆ์ด์–ด ์„ค์ • ํ™•์ธ
122
+ if self.GGUF_N_GPU_LAYERS > 0:
123
+ print(f"โœ… GPU ๊ฐ€์† ํ™œ์„ฑํ™”: {self.GGUF_N_GPU_LAYERS}๊ฐœ ๋ ˆ์ด์–ด")
124
+ else:
125
+ print(f"โš ๏ธ CPU ์ „์šฉ ๋ชจ๋“œ (n_gpu_layers=0)")
126
+
127
+ return True
128
 
129
  def validate_all(self):
130
  """์ „์ฒด ์„ค์ • ์œ ํšจ์„ฑ ๊ฒ€์‚ฌ"""
131
  self.validate_preprocess()
132
  self.validate_rag()
133
+ self.validate_gguf()
134
  return True
135
 
136
  def validate(self):
137
  """์„ค์ • ์œ ํšจ์„ฑ ๊ฒ€์‚ฌ (ํ•˜์œ„ ํ˜ธํ™˜์„ฑ)"""
138
  return self.validate_preprocess()
139
+
140
+ def print_gguf_config(self):
141
+ """GGUF ์„ค์ • ์ถœ๋ ฅ (๋””๋ฒ„๊น…์šฉ)"""
142
+ print("\n" + "="*50)
143
+ print("GGUF ๋ชจ๋ธ ์„ค์ •")
144
+ print("="*50)
145
+ print(f"Model Hub ์‚ฌ์šฉ: {self.USE_MODEL_HUB}")
146
+ if self.USE_MODEL_HUB:
147
+ print(f"Hub Repo: {self.MODEL_HUB_REPO}")
148
+ print(f"Hub ํŒŒ์ผ๋ช…: {self.MODEL_HUB_FILENAME}")
149
+ print(f"์บ์‹œ ๋””๋ ‰ํ† ๋ฆฌ: {self.MODEL_CACHE_DIR}")
150
+ else:
151
+ print(f"๋กœ์ปฌ ๊ฒฝ๋กœ: {self.GGUF_MODEL_PATH}")
152
+ print(f"\nGPU ์„ค์ •:")
153
+ print(f" - GPU ๋ ˆ์ด์–ด: {self.GGUF_N_GPU_LAYERS}")
154
+ print(f" - ์ปจํ…์ŠคํŠธ: {self.GGUF_N_CTX}")
155
+ print(f" - ์Šค๋ ˆ๋“œ: {self.GGUF_N_THREADS}")
156
+ print(f"\n์ƒ์„ฑ ์„ค์ •:")
157
+ print(f" - Max Tokens: {self.GGUF_MAX_NEW_TOKENS}")
158
+ print(f" - Temperature: {self.GGUF_TEMPERATURE}")
159
+ print(f" - Top-P: {self.GGUF_TOP_P}")
160
+ print("="*50 + "\n")
161
 
162
 
163
  # ํ•˜์œ„ ํ˜ธํ™˜์„ฑ์„ ์œ„ํ•œ ๋ณ„์นญ