Dongjin1203 commited on
Commit
d9d7415
·
1 Parent(s): fca8e5d

Test GGUF with lightweight build

Browse files
Files changed (1) hide show
  1. src/generator/generator_gguf.py +52 -39
src/generator/generator_gguf.py CHANGED
@@ -1,4 +1,4 @@
1
- from llama_cpp import Llama # ← 주석 해제!
2
  from typing import Optional, Dict, Any, List
3
  import logging
4
  import time
@@ -275,57 +275,70 @@ class GGUFRAGPipeline:
275
 
276
  def __init__(
277
  self,
278
- config: RAGConfig = None,
279
  model: str = None, # 호환성용 (사용 안 함)
280
- top_k: int = 10,
281
- n_gpu_layers: int = 0, # GPU 레이어 수
282
- n_ctx: int = 2048,
283
- n_threads: int = 8,
284
- max_new_tokens: int = 256,
285
- temperature: float = 0.7,
286
- top_p: float = 0.9,
287
- search_mode: str = "hybrid_rerank",
288
- alpha: float = 0.5
 
289
  ):
290
  """
291
  초기화
292
 
293
  Args:
294
- config: RAGConfig 인스턴스
295
- n_gpu_layers: GPU 레이어 수
296
- n_ctx: 컨텍스트 길이
297
- n_threads: CPU 스레드 수
298
- max_new_tokens: 최대 생성 토큰
299
- temperature: 생성 다양성
300
- top_p: Nucleus sampling
 
 
301
  search_mode: 검색 모드
302
- top_k: 검색할 문서 수
303
  alpha: 임베딩 가중치
304
  """
305
  self.config = config or RAGConfig()
306
- self.search_mode = search_mode
307
- self.top_k = top_k
308
- self.alpha = alpha
309
-
310
- # Retriever 초기화
311
- from src.retriever.hybrid_retriever import HybridRetriever
312
- self.retriever = HybridRetriever(
313
- collection_name=self.config.COLLECTION_NAME,
314
- persist_directory=self.config.CHROMA_DB_DIR,
315
- embedding_model_name=self.config.EMBEDDING_MODEL,
316
- reranker_model_name=self.config.RERANKER_MODEL
317
- )
 
 
 
 
 
 
 
 
 
 
 
 
318
 
319
- # Generator 초기화
320
  self.generator = GGUFGenerator(
321
  model_path=self.config.GGUF_MODEL_PATH,
322
- n_gpu_layers=n_gpu_layers,
323
- n_ctx=n_ctx,
324
- n_threads=n_threads,
325
  config=self.config,
326
- max_new_tokens=max_new_tokens,
327
- temperature=temperature,
328
- top_p=top_p,
329
  system_prompt=self.config.SYSTEM_PROMPT
330
  )
331
 
@@ -345,7 +358,7 @@ class GGUFRAGPipeline:
345
 
346
  def _retrieve_and_format(self, query: str) -> str:
347
  """๊ฒ€์ƒ‰ ์ˆ˜ํ–‰ ๋ฐ ์ปจํ…์ŠคํŠธ ํฌ๋งทํŒ…"""
348
- # 검색 모드에 따라 문서 검색
349
  if self.search_mode == "embedding":
350
  docs = self.retriever.search(query, top_k=self.top_k)
351
  elif self.search_mode == "embedding_rerank":
 
1
+ from llama_cpp import Llama
2
  from typing import Optional, Dict, Any, List
3
  import logging
4
  import time
 
275
 
276
  def __init__(
277
  self,
278
+ config=None,
279
  model: str = None, # 호환성용 (사용 안 함)
280
+ top_k: int = None,
281
+ # GPU 설정 (선택적, config 오버라이드)
282
+ n_gpu_layers: int = None,
283
+ n_ctx: int = None,
284
+ n_threads: int = None,
285
+ max_new_tokens: int = None,
286
+ temperature: float = None,
287
+ top_p: float = None,
288
+ search_mode: str = None,
289
+ alpha: float = None
290
  ):
291
  """
292
  초기화
293
 
294
  Args:
295
+ config: RAGConfig 객체
296
+ model: 모델 이름 (사용 안 함, 호환성용)
297
+ top_k: 기본 검색 문서 수
298
+ n_gpu_layers: GPU 레이어 수 (config 오버라이드)
299
+ n_ctx: 컨텍스트 길이 (config 오버라이드)
300
+ n_threads: CPU 스레드 수 (config 오버라이드)
301
+ max_new_tokens: 최대 생성 토큰 (config 오버라이드)
302
+ temperature: 생성 다양성 (config 오버라이드)
303
+ top_p: Nucleus sampling (config 오버라이드)
304
  search_mode: 검색 모드
 
305
  alpha: 임베딩 가중치
306
  """
307
  self.config = config or RAGConfig()
308
+ self.top_k = top_k or self.config.DEFAULT_TOP_K
309
+
310
+ # 검색 설정
311
+ self.search_mode = search_mode or self.config.DEFAULT_SEARCH_MODE
312
+ self.alpha = alpha if alpha is not None else self.config.DEFAULT_ALPHA
313
+
314
+ # Retriever 초기화 (RAGRetriever 사용)
315
+ logger.info("RAGRetriever ์ดˆ๊ธฐํ™” ์ค‘...")
316
+ from src.retriever.retriever import RAGRetriever
317
+ self.retriever = RAGRetriever(config=self.config)
318
+
319
+ # GGUF 설정 (파라미터가 주어지면 config 오버라이드)
320
+ gguf_n_gpu_layers = n_gpu_layers if n_gpu_layers is not None else self.config.GGUF_N_GPU_LAYERS
321
+ gguf_n_ctx = n_ctx if n_ctx is not None else self.config.GGUF_N_CTX
322
+ gguf_n_threads = n_threads if n_threads is not None else self.config.GGUF_N_THREADS
323
+ gguf_max_new_tokens = max_new_tokens if max_new_tokens is not None else self.config.GGUF_MAX_NEW_TOKENS
324
+ gguf_temperature = temperature if temperature is not None else self.config.GGUF_TEMPERATURE
325
+ gguf_top_p = top_p if top_p is not None else self.config.GGUF_TOP_P
326
+
327
+ # GGUFGenerator 초기화
328
+ logger.info("GGUFGenerator ์ดˆ๊ธฐํ™” ์ค‘...")
329
+ logger.info(f" GPU ๋ ˆ์ด์–ด: {gguf_n_gpu_layers}")
330
+ logger.info(f" ์ปจํ…์ŠคํŠธ: {gguf_n_ctx}")
331
+ logger.info(f" ์Šค๋ ˆ๋“œ: {gguf_n_threads}")
332
 
 
333
  self.generator = GGUFGenerator(
334
  model_path=self.config.GGUF_MODEL_PATH,
335
+ n_gpu_layers=gguf_n_gpu_layers,
336
+ n_ctx=gguf_n_ctx,
337
+ n_threads=gguf_n_threads,
338
  config=self.config,
339
+ max_new_tokens=gguf_max_new_tokens,
340
+ temperature=gguf_temperature,
341
+ top_p=gguf_top_p,
342
  system_prompt=self.config.SYSTEM_PROMPT
343
  )
344
 
 
358
 
359
  def _retrieve_and_format(self, query: str) -> str:
360
  """๊ฒ€์ƒ‰ ์ˆ˜ํ–‰ ๋ฐ ์ปจํ…์ŠคํŠธ ํฌ๋งทํŒ…"""
361
+ # 검색 모드에 따라 문서 검색 (RAGRetriever 메서드 사용)
362
  if self.search_mode == "embedding":
363
  docs = self.retriever.search(query, top_k=self.top_k)
364
  elif self.search_mode == "embedding_rerank":