gbrabbit committed on
Commit
eddb502
·
1 Parent(s): 159a5fc

Auto commit at 09-2025-08 7:58:00

Browse files
lily_llm_api/app_v2.py CHANGED
@@ -65,7 +65,6 @@ app.add_middleware(
65
  "http://127.0.0.1:8001",
66
  "http://localhost:3000",
67
  "http://127.0.0.1:3000",
68
- "https://hearthchat-production.up.railway.app",
69
  "*" # 개발 중에는 모든 origin 허용
70
  ],
71
  allow_credentials=True,
@@ -486,7 +485,7 @@ async def generate_multimodal(
486
  return_tensors="pt",
487
  padding=True,
488
  truncation=True,
489
- max_length=max_length,
490
  )
491
 
492
  if 'token_type_ids' in inputs:
@@ -558,10 +557,10 @@ async def generate_multimodal(
558
  top_k=40,
559
  top_p=top_p,
560
  repetition_penalty=1.1,
561
- # no_repeat_ngram_size=2,
562
  pad_token_id=tokenizer.eos_token_id,
563
  eos_token_id=tokenizer.eos_token_id,
564
- # use_cache=True
565
  )
566
  logger.info("✅ 실제 멀티모달 생성 성공!")
567
 
@@ -579,9 +578,9 @@ async def generate_multimodal(
579
  enhanced_inputs = tokenizer(
580
  enhanced_formatted_prompt,
581
  return_tensors="pt",
582
- # padding=True,
583
- # truncation=True,
584
- max_length=max_length
585
  )
586
 
587
  if 'token_type_ids' in enhanced_inputs:
@@ -598,10 +597,10 @@ async def generate_multimodal(
598
  top_k=40,
599
  top_p=top_p,
600
  repetition_penalty=1.1,
601
- # no_repeat_ngram_size=2,
602
  pad_token_id=tokenizer.eos_token_id,
603
  eos_token_id=tokenizer.eos_token_id,
604
- # use_cache=True
605
  )
606
  else:
607
  # 텍스트 전용 생성
@@ -614,11 +613,11 @@ async def generate_multimodal(
614
  temperature=temperature,
615
  top_k=40,
616
  top_p=top_p,
617
- # repetition_penalty=1.1,
618
- # no_repeat_ngram_size=2,
619
  pad_token_id=tokenizer.eos_token_id,
620
  eos_token_id=tokenizer.eos_token_id,
621
- # use_cache=True
622
  )
623
 
624
  # 응답 추출
 
65
  "http://127.0.0.1:8001",
66
  "http://localhost:3000",
67
  "http://127.0.0.1:3000",
 
68
  "*" # 개발 중에는 모든 origin 허용
69
  ],
70
  allow_credentials=True,
 
485
  return_tensors="pt",
486
  padding=True,
487
  truncation=True,
488
+ max_length=100,
489
  )
490
 
491
  if 'token_type_ids' in inputs:
 
557
  top_k=40,
558
  top_p=top_p,
559
  repetition_penalty=1.1,
560
+ no_repeat_ngram_size=2,
561
  pad_token_id=tokenizer.eos_token_id,
562
  eos_token_id=tokenizer.eos_token_id,
563
+ use_cache=True
564
  )
565
  logger.info("✅ 실제 멀티모달 생성 성공!")
566
 
 
578
  enhanced_inputs = tokenizer(
579
  enhanced_formatted_prompt,
580
  return_tensors="pt",
581
+ padding=True,
582
+ truncation=True,
583
+ max_length=256
584
  )
585
 
586
  if 'token_type_ids' in enhanced_inputs:
 
597
  top_k=40,
598
  top_p=top_p,
599
  repetition_penalty=1.1,
600
+ no_repeat_ngram_size=2,
601
  pad_token_id=tokenizer.eos_token_id,
602
  eos_token_id=tokenizer.eos_token_id,
603
+ use_cache=True
604
  )
605
  else:
606
  # 텍스트 전용 생성
 
613
  temperature=temperature,
614
  top_k=40,
615
  top_p=top_p,
616
+ repetition_penalty=1.1,
617
+ no_repeat_ngram_size=2,
618
  pad_token_id=tokenizer.eos_token_id,
619
  eos_token_id=tokenizer.eos_token_id,
620
+ use_cache=True
621
  )
622
 
623
  # 응답 추출
lily_llm_api/models/kanana_1_5_v_3b_instruct.py CHANGED
@@ -16,8 +16,6 @@ HF_TOKEN = os.getenv("HF_TOKEN")
16
  logger = logging.getLogger(__name__)
17
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
18
 
19
- max_new_tokens = 64
20
-
21
  class Kanana15V3bInstructProfile:
22
  """Kanana-1.5-v-3b-instruct ๋ชจ๋ธ ํ”„๋กœํ•„"""
23
 
@@ -29,11 +27,11 @@ class Kanana15V3bInstructProfile:
29
  if self.is_local:
30
  self.model_name = "gbrabbit/lily-math-model" # 로컬에서도 HF 모델명 사용
31
  self.local_path = "./lily_llm_core/models/kanana_1_5_v_3b_instruct"
32
- self.display_name = "Kanana-1.5-v-3b-instruct (로컬)"
33
  else:
34
  self.model_name = "gbrabbit/lily-math-model" # Hugging Face Hub 모델 경로
35
  self.local_path = None # 서버에서는 로컬 경로 사용 안함
36
- self.display_name = "Kanana-1.5-v-3b-instruct (서버)"
37
 
38
  self.description = "카카오 멀티모달 모델 (3.6B) - Math RAG 특화"
39
  self.language = "ko"
@@ -183,7 +181,7 @@ class Kanana15V3bInstructProfile:
183
  token=HF_TOKEN,
184
  torch_dtype=torch.float16,
185
  trust_remote_code=True,
186
- # cache_dir="/app/cache/transformers",
187
  # device_map="auto",
188
  # low_cpu_mem_usage=True,
189
  ).to(DEVICE)
@@ -199,7 +197,7 @@ class Kanana15V3bInstructProfile:
199
 
200
  def get_generation_config(self) -> Dict[str, Any]:
201
  # ๋ชจ๋ธ ํŒŒ๋ผ๋ฏธํ„ฐ ์ตœ์ ํ™” ์„ค์ •, max_new_tokens : ์ƒ์„ฑ๋˜๋Š” ํ…์ŠคํŠธ ๊ธธ์ด ์ตœ๋Œ€๊ฐ’ (์ด๋ฏธ์ง€ ์„ค๋ช…์„ ์œ„ํ•ด ์ฆ๊ฐ€)
202
- return {"max_new_tokens": max_new_tokens, "temperature": 0.7, "do_sample": True, "top_k": 40, "top_p": 0.9, "repetition_penalty": 1.1}
203
 
204
  def extract_response(self, full_text: str, formatted_prompt: str = None, **kwargs) -> str:
205
  """
 
16
  logger = logging.getLogger(__name__)
17
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
18
 
 
 
19
  class Kanana15V3bInstructProfile:
20
  """Kanana-1.5-v-3b-instruct ๋ชจ๋ธ ํ”„๋กœํ•„"""
21
 
 
27
  if self.is_local:
28
  self.model_name = "gbrabbit/lily-math-model" # 로컬에서도 HF 모델명 사용
29
  self.local_path = "./lily_llm_core/models/kanana_1_5_v_3b_instruct"
30
+ self.display_name = "kanana-1.5-v-3b-instruct"
31
  else:
32
  self.model_name = "gbrabbit/lily-math-model" # Hugging Face Hub 모델 경로
33
  self.local_path = None # 서버에서는 로컬 경로 사용 안함
34
+ self.display_name = "kanana-1.5-v-3b-instruct"
35
 
36
  self.description = "카카오 멀티모달 모델 (3.6B) - Math RAG 특화"
37
  self.language = "ko"
 
181
  token=HF_TOKEN,
182
  torch_dtype=torch.float16,
183
  trust_remote_code=True,
184
+ cache_dir="/app/cache/transformers",
185
  # device_map="auto",
186
  # low_cpu_mem_usage=True,
187
  ).to(DEVICE)
 
197
 
198
  def get_generation_config(self) -> Dict[str, Any]:
199
  # ๋ชจ๋ธ ํŒŒ๋ผ๋ฏธํ„ฐ ์ตœ์ ํ™” ์„ค์ •, max_new_tokens : ์ƒ์„ฑ๋˜๋Š” ํ…์ŠคํŠธ ๊ธธ์ด ์ตœ๋Œ€๊ฐ’ (์ด๋ฏธ์ง€ ์„ค๋ช…์„ ์œ„ํ•ด ์ฆ๊ฐ€)
200
+ return {"max_new_tokens": 256, "temperature": 0.7, "do_sample": True, "top_k": 40, "top_p": 0.9, "repetition_penalty": 1.1}
201
 
202
  def extract_response(self, full_text: str, formatted_prompt: str = None, **kwargs) -> str:
203
  """