gbrabbit committed on
Commit d098bcd · 1 Parent(s): b08dfac

Auto commit at 20-2025-08 20:10:37

lily_llm_api/app_v2.py CHANGED
@@ -191,7 +191,7 @@ def configure_cpu_threads():
     else:
         detected = os.cpu_count() or 2
         # Use the container/server vCPU count as-is, but apply an upper bound
-        threads = max(1, min(detected, 8))
+        threads = max(1, min(detected, 16))
 
     # OpenMP/MKL/numexpr
     os.environ["OMP_NUM_THREADS"] = str(threads)
@@ -225,7 +225,7 @@ def select_model_interactive():
     try:
         # choice = input(f"\n📝 Select the model number to use (1-{len(available_models)}): ")
         # selected_model = available_models[int(choice) - 1]
-        selected_model = available_models[0]
+        selected_model = available_models[2]
         print(f"\n✅ Selected the '{selected_model['name']}' model.")
         return selected_model['model_id']
     except (ValueError, IndexError):
@@ -540,6 +540,16 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
     gen_config['use_cache'] = True  # use the cache to speed up generation
     gen_config['pad_token_id'] = tokenizer.eos_token_id if tokenizer.eos_token_id else None
 
+    # Force the EOS token - fixes responses that never terminate the sentence
+    if tokenizer.eos_token_id is not None:
+        gen_config['eos_token_id'] = tokenizer.eos_token_id
+        print(f"🔍 [DEBUG] EOS token forced: {tokenizer.eos_token} (ID: {tokenizer.eos_token_id})")
+    else:
+        print(f"⚠️ [DEBUG] No EOS token is set")
+
+    # Final check of the generation config
+    print(f"🔍 [DEBUG] Final generation config: {gen_config}")
+
     print(f"🔍 [DEBUG] Starting model generation - text only")
     print(f"🔍 [DEBUG] Final input tensor device: {input_ids.device}")
     print(f"🔍 [DEBUG] Final attention_mask device: {attention_mask.device}")
 
lily_llm_api/models/polyglot_ko_1_3b_chat.py CHANGED
@@ -25,7 +25,7 @@ class PolyglotKo13bChatProfile:
         self.model_size = "1.3B"
 
     def load_model(self) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
-        """Load the model (local first, else the Hub)"""
+        """Load the model (tokenizer settings fixed)"""
         logger.info(f"📥 Loading the {self.display_name} model...")
         try:
             use_local = Path(self.local_path).exists() and any(Path(self.local_path).iterdir())
@@ -39,8 +39,21 @@ class PolyglotKo13bChatProfile:
                 trust_remote_code=True,
                 local_files_only=use_local,
             )
+
+            # Fix the tokenizer settings - resolves the EOS token problem
+            if tokenizer.eos_token is None:
+                logger.warning("⚠️ No EOS token. Setting <|endoftext|> per the model's official docs")
+                tokenizer.eos_token = "<|endoftext|>"
+
             if tokenizer.pad_token is None:
+                logger.warning("⚠️ No PAD token. Falling back to the EOS token")
                 tokenizer.pad_token = tokenizer.eos_token
+
+            # Inspect the special tokens
+            logger.info(f"🔍 Tokenizer settings:")
+            logger.info(f"  - EOS token: {tokenizer.eos_token} (ID: {tokenizer.eos_token_id})")
+            logger.info(f"  - PAD token: {tokenizer.pad_token} (ID: {tokenizer.pad_token_id})")
+            logger.info(f"  - BOS token: {tokenizer.bos_token} (ID: {tokenizer.bos_token_id})")
 
             # float32 is more stable on CPU; float16 on CUDA
             device = 'cuda' if torch.cuda.is_available() else 'cpu'
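The fallback chain above (EOS → `<|endoftext|>`, PAD → EOS) can be checked in isolation; GPT-NeoX-style tokenizers normally ship `<|endoftext|>` already, so the first branch is a guard rather than the common path. A sketch, assuming the base polyglot-ko tokenizer:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("EleutherAI/polyglot-ko-1.3b")  # illustrative checkpoint
if tok.eos_token is None:
    tok.eos_token = "<|endoftext|>"  # assumption: this maps onto the existing vocab entry when present
if tok.pad_token is None:
    tok.pad_token = tok.eos_token    # padding reuses EOS, as in the diff
print(tok.eos_token, tok.eos_token_id, tok.pad_token, tok.pad_token_id)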
@@ -60,9 +73,9 @@ class PolyglotKo13bChatProfile:
             raise
 
     def format_prompt(self, user_input: str) -> str:
-        """Prompt formatting - chat style (optimized)"""
-        # a shorter prompt to cut the token count
-        prompt = f"""AI 챗봇입니다. 도움이 되고 유익한 내용을 제공하세요.
+        """Prompt formatting - matches the official docs"""
+        # use the official prompt format from the Hugging Face model page
+        prompt = f"""당신은 AI 챗봇입니다. 사용자에게 도움이 되고 유익한 내용을 제공해야합니다. 답변은 길고 자세하며 친절한 설명을 덧붙여서 작성하세요.
 
 ### 사용자:
 {user_input}
@@ -72,7 +85,7 @@ class PolyglotKo13bChatProfile:
         return prompt
 
     def extract_response(self, full_text: str, formatted_prompt: str = None) -> str:
-        """Response extraction - improved with more robust logic"""
+        """Response extraction - with quality validation and repair"""
         logger.info(f"--- Starting Polyglot response extraction ---")
         logger.info(f"Full generated text (Raw): \n---\n{full_text}\n---")
         logger.info(f"Prompt used: {formatted_prompt}")
@@ -82,21 +95,34 @@ class PolyglotKo13bChatProfile:
             response = full_text.split("### 챗봇:")[-1].strip()
             logger.info(f"✅ Success: extracted the response via the '### 챗봇:' tag")
             logger.info(f"Extracted response: {response}")
-            if response:  # return only when non-empty
+
+            # validate the response quality
+            if self._validate_response_quality(response):
                 return response
+            else:
+                logger.warning("⚠️ Low response quality. Applying the quality repair step.")
+                return self._improve_response_quality(response)
 
         # 2nd priority: try extracting by removing the prompt
         if formatted_prompt and formatted_prompt in full_text:
             response = full_text.replace(formatted_prompt, "").strip()
             logger.info(f"✅ Success: extracted the response by removing the prompt")
             logger.info(f"Extracted response: {response}")
-            if response:  # return only when non-empty
+
+            if self._validate_response_quality(response):
                 return response
+            else:
+                return self._improve_response_quality(response)
 
         # 3rd priority: try removing common prompt patterns
         clean_text = full_text.strip()
         patterns_to_remove = [
-            "당신은 AI 챗봇입니다. 사용자에게 도움이 되고 유익한 내용을 제공해야합니다. 답변은 길고 자세하며 친절한 설명을 덧붙여서 작성하세요.",
+            "당신은 한국어 AI 챗봇입니다. 다음 규칙을 엄격히 따라주세요:",
+            "1. 반드시 한국어로만 응답하세요",
+            "2. 자연스럽고 일관성 있는 대화를 유지하세요",
+            "3. 사용자의 질문에 정확하고 도움이 되는 답변을 제공하세요",
+            "4. 문장이 중간에 끊기지 않도록 완성된 답변을 작성하세요",
+            "5. 영어나 다른 언어를 사용하지 마세요",
             "### 사용자:",
             "### 챗봇:",
             "사용자:",
@@ -113,28 +139,81 @@ class PolyglotKo13bChatProfile:
         if clean_text and clean_text != full_text:
             logger.info("✅ Success: cleaned the response via pattern removal")
             logger.info(f"Cleaned response: {clean_text}")
-            return clean_text
+
+            if self._validate_response_quality(clean_text):
+                return clean_text
+            else:
+                return self._improve_response_quality(clean_text)
 
         # 4th priority: strip only the unnecessary parts from the full text
         final_response = full_text.strip()
         logger.warning("⚠️ Warning: no specific extraction pattern was found. Returning the cleaned full text.")
         logger.info(f"Final returned text: {final_response}")
-        return final_response
+
+        if self._validate_response_quality(final_response):
+            return final_response
+        else:
+            return self._improve_response_quality(final_response)
+
+    def _validate_response_quality(self, response: str) -> bool:
+        """Validate response quality"""
+        if not response or len(response.strip()) < 5:
+            return False
+
+        # English characters indicate low quality
+        if any(char.isascii() and char.isalpha() for char in response):
+            return False
+
+        # a sentence cut off mid-way indicates low quality
+        if response.endswith(('하', '는', '을', '를', '이', '가', '의', '에', '로')):
+            return False
+
+        # heavy word repetition indicates low quality
+        words = response.split()
+        if len(words) > 3 and len(set(words)) / len(words) < 0.7:
+            return False
+
+        return True
+
+    def _improve_response_quality(self, response: str) -> str:
+        """Repair response quality"""
+        # basic cleanup
+        improved = response.strip()
+
+        # strip English characters
+        import re
+        improved = re.sub(r'[a-zA-Z]+', '', improved)
+
+        # collapse repeated whitespace
+        improved = re.sub(r'\s+', ' ', improved)
+
+        # patch sentences cut off mid-way
+        if improved.endswith(('하', '는', '을', '를', '이', '가', '의', '에', '로')):
+            improved += '니다.'
+
+        # append a default notice when the response is too short
+        if len(improved) < 10:
+            improved = f"{improved} (응답이 너무 짧습니다. 더 자세한 답변을 원하시면 다시 질문해주세요.)"
+
+        logger.info(f"🔧 Response quality repair complete: {improved}")
+        return improved
 
     def get_generation_config(self) -> Dict[str, Any]:
-        """Generation config - optimized for speed"""
+        """Generation config - use the official EOS token"""
         return {
-            "max_new_tokens": 64,  # reduced from 128 to 64 for speed
-            "temperature": 0.7,  # keep moderate creativity
-            "do_sample": True,  # enable sampling
-            "top_k": 40,  # reduced from 50 to 40 for speed
-            "top_p": 0.9,  # keep nucleus sampling
-            "repetition_penalty": 1.1,  # prevent repetition
-            "no_repeat_ngram_size": 3,  # prevent n-gram repetition
-            "pad_token_id": None,  # use the model default
-            "eos_token_id": None,  # use the model default
-            "use_cache": True,  # use the cache for speed
-            "max_time": 60.0,  # 60-second timeout
+            "max_new_tokens": 128,  # raised from 64 to 128 to allow complete answers
+            "temperature": 0.3,  # improves consistency
+            "do_sample": True,  # enable sampling
+            "top_k": 20,  # improves quality
+            "top_p": 0.8,  # improves consistency
+            "repetition_penalty": 1.2,  # prevent repetition
+            "no_repeat_ngram_size": 4,  # prevent n-gram repetition
+            "pad_token_id": None,  # use the model default
+            "eos_token_id": None,  # None so the model auto-detects <|endoftext|>
+            "use_cache": True,  # use the cache for speed
+            "max_time": 60.0,  # 60-second timeout
+            "early_stopping": False,  # False so generation runs until <|endoftext|>
+            "stopping_criteria": None,  # use the default stopping criteria
         }
 
     def get_model_info(self) -> Dict[str, Any]:
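The new quality gate is purely heuristic, so it is worth seeing which strings pass. A standalone copy of the checks, exercised on hypothetical outputs; note that the Latin-letter test also rejects legitimate code-switching such as "AI" or "GPU":

def validate_response_quality(response: str) -> bool:
    # same heuristics as _validate_response_quality above
    if not response or len(response.strip()) < 5:
        return False
    if any(ch.isascii() and ch.isalpha() for ch in response):
        return False  # contains Latin letters
    if response.endswith(('하', '는', '을', '를', '이', '가', '의', '에', '로')):
        return False  # ends on a dangling particle, likely truncated
    words = response.split()
    if len(words) > 3 and len(set(words)) / len(words) < 0.7:
        return False  # heavy word repetition
    return True

print(validate_response_quality("서울은 대한민국의 수도입니다."))  # True
print(validate_response_quality("Seoul is the capital."))           # False (Latin letters)
print(validate_response_quality("서울은 대한민국의"))                # False (truncated on '의')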
 
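Putting the profile together, the calling convention implied by generate_sync in app_v2.py looks roughly like this (the wiring is a sketch, not code from the commit):

profile = PolyglotKo13bChatProfile()
model, tokenizer = profile.load_model()

prompt = profile.format_prompt("한국의 수도는 어디인가요?")
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

gen_config = profile.get_generation_config()
output_ids = model.generate(**inputs, **gen_config)  # None-valued keys fall back to model defaults

full_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(profile.extract_response(full_text, prompt))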
lily_llm_api/models/polyglot_ko_5_8b_chat.py CHANGED
@@ -1,31 +1,33 @@
 #!/usr/bin/env python3
 """
-heegyu/polyglot-ko-5.8b-chat model profile
+Polyglot-ko-5.8b-chat model profile
+For the heegyu/polyglot-ko-5.8b-chat model
 """
 
 from typing import Dict, Any, Tuple
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import logging
+import os
+from pathlib import Path
 
 logger = logging.getLogger(__name__)
 
 class PolyglotKo58bChatProfile:
-    """heegyu/polyglot-ko-5.8b-chat model profile"""
+    """Polyglot-ko-5.8b-chat model profile"""
 
     def __init__(self):
         self.model_name = "heegyu/polyglot-ko-5.8b-chat"
         self.local_path = "./lily_llm_core/models/polyglot_ko_5_8b_chat"
-        self.display_name = "heegyu/polyglot-ko-5.8b-chat"
-        self.description = "A model trained from EleutherAI/polyglot-ko-5.8b on several Korean instruction datasets"
+        self.display_name = "Polyglot-ko-5.8b-chat"
+        self.description = "High-performance Korean chat model (5.8B)"
         self.language = "ko"
         self.model_size = "5.8B"
 
     def load_model(self) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
-        """Load the model (local first, else the Hub)"""
+        """Load the model (tokenizer settings fixed)"""
         logger.info(f"📥 Loading the {self.display_name} model...")
         try:
-            from pathlib import Path
             use_local = Path(self.local_path).exists() and any(Path(self.local_path).iterdir())
             model_path = self.local_path if use_local else self.model_name
@@ -37,11 +39,25 @@ class PolyglotKo58bChatProfile:
                 trust_remote_code=True,
                 local_files_only=use_local,
             )
+
+            # Fix the tokenizer settings - resolves the EOS token problem
+            if tokenizer.eos_token is None:
+                logger.warning("⚠️ No EOS token. Setting <|endoftext|> per the model's official docs")
+                tokenizer.eos_token = "<|endoftext|>"
+
             if tokenizer.pad_token is None:
+                logger.warning("⚠️ No PAD token. Falling back to the EOS token")
                 tokenizer.pad_token = tokenizer.eos_token
+
+            # Inspect the special tokens
+            logger.info(f"🔍 Tokenizer settings:")
+            logger.info(f"  - EOS token: {tokenizer.eos_token} (ID: {tokenizer.eos_token_id})")
+            logger.info(f"  - PAD token: {tokenizer.pad_token} (ID: {tokenizer.pad_token_id})")
+            logger.info(f"  - BOS token: {tokenizer.bos_token} (ID: {tokenizer.bos_token_id})")
 
+            # float32 is more stable on CPU; float16 on CUDA
             device = 'cuda' if torch.cuda.is_available() else 'cpu'
-            selected_dtype = torch.float16 if device == 'cuda' else torch.float16
+            selected_dtype = torch.float16 if device == 'cuda' else torch.float32
 
             model = AutoModelForCausalLM.from_pretrained(
                 model_path,
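The one-character dtype fix in the hunk above matters: the old line selected float16 on both branches, and float16 inference on CPU is slow and numerically fragile in PyTorch since many kernels lack stable half-precision implementations. The intended selection, as a sketch:

import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
# float16 halves memory on GPU; float32 avoids missing/unstable half-precision CPU kernels
selected_dtype = torch.float16 if device == 'cuda' else torch.float32
print(device, selected_dtype)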
@@ -57,33 +73,142 @@ class PolyglotKo58bChatProfile:
             raise
 
     def format_prompt(self, user_input: str) -> str:
-        """Prompt formatting"""
-        return f"### 질문: {user_input}\n\n### 답변:"
+        """Prompt formatting - matches the official docs"""
+        # use the official prompt format from the Hugging Face model page
+        prompt = f"""당신은 AI 챗봇입니다. 사용자에게 도움이 되고 유익한 내용을 제공해야합니다. 답변은 길고 자세하며 친절한 설명을 덧붙여서 작성하세요.
+
+### 사용자:
+{user_input}
+
+### 챗봇:
+"""
+        return prompt
 
-    def extract_response(self, full_text: str, formatted_prompt: str) -> str:
-        """Response extraction"""
-        if "### 답변:" in full_text:
-            response = full_text.split("### 답변:")[-1].strip()
-        else:
-            if formatted_prompt in full_text:
-                response = full_text.replace(formatted_prompt, "").strip()
-            else:
-                response = full_text.strip()
-
-        return response
+    def extract_response(self, full_text: str, formatted_prompt: str = None) -> str:
+        """Response extraction - with quality validation and repair"""
+        logger.info(f"--- Starting Polyglot 5.8B response extraction ---")
+        logger.info(f"Full generated text (Raw): \n---\n{full_text}\n---")
+        logger.info(f"Prompt used: {formatted_prompt}")
+
+        # 1st priority: try extracting via the "### 챗봇:" tag
+        if "### 챗봇:" in full_text:
+            response = full_text.split("### 챗봇:")[-1].strip()
+            logger.info(f"✅ Success: extracted the response via the '### 챗봇:' tag")
+            logger.info(f"Extracted response: {response}")
+
+            # validate the response quality
+            if self._validate_response_quality(response):
+                return response
+            else:
+                logger.warning("⚠️ Low response quality. Applying the quality repair step.")
+                return self._improve_response_quality(response)
+
+        # 2nd priority: try extracting by removing the prompt
+        if formatted_prompt and formatted_prompt in full_text:
+            response = full_text.replace(formatted_prompt, "").strip()
+            logger.info(f"✅ Success: extracted the response by removing the prompt")
+            logger.info(f"Extracted response: {response}")
+
+            if self._validate_response_quality(response):
+                return response
+            else:
+                return self._improve_response_quality(response)
+
+        # 3rd priority: try removing common prompt patterns
+        clean_text = full_text.strip()
+        patterns_to_remove = [
+            "당신은 AI 챗봇입니다. 사용자에게 도움이 되고 유익한 내용을 제공해야합니다. 답변은 길고 자세하며 친절한 설명을 덧붙여서 작성하세요.",
+            "### 사용자:",
+            "### 챗봇:",
+            "사용자:",
+            "챗봇:",
+            "assistant:",
+            "user:"
+        ]
+
+        for pattern in patterns_to_remove:
+            clean_text = clean_text.replace(pattern, "")
+
+        clean_text = clean_text.strip()
+
+        if clean_text and clean_text != full_text:
+            logger.info("✅ Success: cleaned the response via pattern removal")
+            logger.info(f"Cleaned response: {clean_text}")
+
+            if self._validate_response_quality(clean_text):
+                return clean_text
+            else:
+                return self._improve_response_quality(clean_text)
+
+        # 4th priority: strip only the unnecessary parts from the full text
+        final_response = full_text.strip()
+        logger.warning("⚠️ Warning: no specific extraction pattern was found. Returning the cleaned full text.")
+        logger.info(f"Final returned text: {final_response}")
+
+        if self._validate_response_quality(final_response):
+            return final_response
+        else:
+            return self._improve_response_quality(final_response)
+
+    def _validate_response_quality(self, response: str) -> bool:
+        """Validate response quality"""
+        if not response or len(response.strip()) < 5:
+            return False
+
+        # English characters indicate low quality
+        if any(char.isascii() and char.isalpha() for char in response):
+            return False
+
+        # a sentence cut off mid-way indicates low quality
+        if response.endswith(('하', '는', '을', '를', '이', '가', '의', '에', '로')):
+            return False
+
+        # heavy word repetition indicates low quality
+        words = response.split()
+        if len(words) > 3 and len(set(words)) / len(words) < 0.7:
+            return False
+
+        return True
+
+    def _improve_response_quality(self, response: str) -> str:
+        """Repair response quality"""
+        # basic cleanup
+        improved = response.strip()
+
+        # strip English characters
+        import re
+        improved = re.sub(r'[a-zA-Z]+', '', improved)
+
+        # collapse repeated whitespace
+        improved = re.sub(r'\s+', ' ', improved)
+
+        # patch sentences cut off mid-way
+        if improved.endswith(('하', '는', '을', '를', '이', '가', '의', '에', '로')):
+            improved += '니다.'
+
+        # append a default notice when the response is too short
+        if len(improved) < 10:
+            improved = f"{improved} (응답이 너무 짧습니다. 더 자세한 답변을 원하시면 다시 질문해주세요.)"
+
+        logger.info(f"🔧 Response quality repair complete: {improved}")
+        return improved
 
     def get_generation_config(self) -> Dict[str, Any]:
-        """Generation config"""
+        """Generation config - use the official EOS token"""
         return {
-            "max_new_tokens": 128,
-            "temperature": 0.7,
-            "do_sample": True,
-            "top_k": 50,
-            "top_p": 0.9,
-            "repetition_penalty": 1.1,
-            "no_repeat_ngram_size": 3,
-            "pad_token_id": None,
-            "eos_token_id": None,
+            "max_new_tokens": 128,  # the 5.8B model can generate longer answers
+            "temperature": 0.3,  # improves consistency
+            "do_sample": True,  # enable sampling
+            "top_k": 20,  # improves quality
+            "top_p": 0.8,  # improves consistency
+            "repetition_penalty": 1.2,  # prevent repetition
+            "no_repeat_ngram_size": 4,  # prevent n-gram repetition
+            "pad_token_id": None,  # use the model default
+            "eos_token_id": None,  # None so the model auto-detects <|endoftext|>
+            "use_cache": True,  # use the cache for speed
+            "max_time": 240.0,  # the 5.8B model needs a longer budget (240-second timeout)
+            "early_stopping": False,  # False so generation runs until <|endoftext|>
+            "stopping_criteria": None,  # use the default stopping criteria
         }
 
     def get_model_info(self) -> Dict[str, Any]:
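One note on the 5.8B config: the comment now matches the value (a 240-second budget), and `max_time` is the backstop if `<|endoftext|>` never arrives, since transformers converts it into a `MaxTimeCriteria`. The equivalent explicit form, as a sketch:

from transformers import MaxTimeCriteria, StoppingCriteriaList

# passing max_time=240.0 to generate() is equivalent to supplying this explicitly
stopping = StoppingCriteriaList([MaxTimeCriteria(max_time=240.0)])
# out = model.generate(**inputs, stopping_criteria=stopping, max_new_tokens=128)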
 