Spaces:

bissal
/

clovax-tax-chatbot

Paused

File size: 16,251 Bytes

4aa05c6

# llm_processor.py - LLM 처리 모듈
import os
import re
import time
from datetime import datetime
import logging

# HuggingFace 관련 import
try:
    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
        LlamaConfig,
        LlamaForCausalLM,
        BitsAndBytesConfig
    )
    import torch
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    print("⚠️ Transformers 라이브러리가 설치되지 않았습니다")
    TRANSFORMERS_AVAILABLE = False

class TaxRuleEngine:
    """Rule-based calculator for housing acquisition tax (extracted from a notebook).

    Every rate handled by this class is expressed in permille (thousandths),
    so ``80`` means 8%.
    """

    def __init__(self):
        # Regulated ("adjustment target") districts. is_adjustment_area()
        # matches these as substrings of a location string.
        self.adjustment_areas = [
            "강남구", "서초구", "송파구", "용산구"
        ]

        # Heavy-tax rates for multi-home households, in permille.
        self.multi_housing_rates = {
            "1세대2주택_조정대상": 80,      # 8%
            "1세대3주택_조정대상": 120,     # 12%
            "1세대4주택이상_조정대상": 120,  # 12%
            "1세대3주택_조정대상외": 80,    # 8%
            "1세대4주택이상_조정대상외": 120, # 12%
        }

    def calculate_housing_tax_rate(self, acquisition_value):
        """Return the base housing acquisition-tax rate in permille.

        Brackets (Local Tax Act, Article 11(1)(8)):

        * up to 600,000,000 won: 10 permille
        * above 600,000,000 and up to 900,000,000 won: linear interpolation
          from 10 to 30 permille
        * above 900,000,000 won: 30 permille

        The statute rounds the interpolated rate, written as a fraction, half
        up to four decimal places (e.g. 0.016666... -> 0.0167). In permille
        that is one decimal place, so 700,000,000 won yields 16.7.

        Args:
            acquisition_value: Acquisition price in won.

        Returns:
            ``10`` or ``30`` for the flat brackets, otherwise a float that is
            a multiple of 0.1.
        """
        if acquisition_value <= 600000000:
            return 10
        if acquisition_value > 900000000:
            return 30

        excess = acquisition_value - 600000000
        # Rate in units of 0.1 permille: 100 + excess / 1,500,000.
        # Adding half the divisor before floor-dividing rounds half up without
        # going through binary floating point.
        rate_tenths = 100 + (2 * excess + 1500000) // 3000000
        return rate_tenths / 10

    def is_adjustment_area(self, location):
        """Return True when any configured regulated district occurs in ``location``."""
        return any(area in location for area in self.adjustment_areas)

    def determine_multi_housing_heavy_tax(self, total_housing_count, is_adjustment_area, acquisition_type="매매"):
        """Pick the heavy-tax category for a household, or None when none applies.

        Args:
            total_housing_count: Homes owned including the one being acquired.
            is_adjustment_area: Whether the acquired home is in a regulated area.
            acquisition_type: Acquisition kind; inheritance, gift and gratuitous
                acquisitions follow a separate rule.

        Returns:
            A key of ``multi_housing_rates``, the special gift category
            ``'조정지역고가주택증여'``, or None.
        """
        if acquisition_type in ['상속', '증여', '무상취득']:
            if is_adjustment_area and total_housing_count >= 2:
                return '조정지역고가주택증여'  # 12%
            return None

        if total_housing_count <= 1:
            return None
        if total_housing_count == 2:
            # Two homes are only taxed heavily inside a regulated area.
            return '1세대2주택_조정대상' if is_adjustment_area else None
        if total_housing_count == 3:
            return '1세대3주택_조정대상' if is_adjustment_area else '1세대3주택_조정대상외'
        # Four or more homes.
        return '1세대4주택이상_조정대상' if is_adjustment_area else '1세대4주택이상_조정대상외'

    def calculate_comprehensive_tax(self, property_info):
        """Compute the acquisition tax for one property.

        Args:
            property_info: Mapping with ``acquisition_value`` (won) and
                optionally ``housing_list`` (homes already owned),
                ``location``, ``acquisition_type`` and a pre-decided
                ``heavy_tax_type``.

        Returns:
            None when no (or a zero) acquisition value is given; otherwise a
            dict with ``tax_amount``, ``base_rate``, ``final_rate``,
            ``heavy_tax_type``, ``is_adjustment_area``,
            ``total_housing_count`` and ``acquisition_value``.
        """
        if not property_info.get('acquisition_value'):
            return None

        acquisition_value = property_info['acquisition_value']

        # Base rate from the price brackets.
        base_rate = self.calculate_housing_tax_rate(acquisition_value)

        # The home being acquired counts towards the household total.
        total_housing_count = len(property_info.get('housing_list', [])) + 1
        is_adjustment_area = self.is_adjustment_area(property_info.get('location', ''))

        # A caller-supplied category wins; otherwise derive it.
        heavy_tax_type = property_info.get('heavy_tax_type')
        if not heavy_tax_type:
            heavy_tax_type = self.determine_multi_housing_heavy_tax(
                total_housing_count,
                is_adjustment_area,
                property_info.get('acquisition_type', '매매')
            )

        # Heavy-tax categories replace the base rate; unknown categories
        # leave it unchanged.
        final_rate = base_rate
        if heavy_tax_type and heavy_tax_type in self.multi_housing_rates:
            final_rate = self.multi_housing_rates[heavy_tax_type]
        elif heavy_tax_type == '조정지역고가주택증여':
            final_rate = 120  # 12%

        # Tax-exempt threshold: acquisitions of 500,000 won or less.
        if acquisition_value <= 500000:
            tax_amount = 0
        else:
            # Every rate produced here is a multiple of 0.1 permille, so
            # scale it to an integer and use integer arithmetic. Multiplying
            # by a binary float such as 0.0167 and truncating can land one
            # won short of the exact product.
            rate_tenths = round(final_rate * 10)
            tax_amount = int(acquisition_value * rate_tenths // 10000)

        return {
            'tax_amount': tax_amount,
            'base_rate': base_rate,
            'final_rate': final_rate,
            'heavy_tax_type': heavy_tax_type,
            'is_adjustment_area': is_adjustment_area,
            'total_housing_count': total_housing_count,
            'acquisition_value': acquisition_value
        }

class LLMProcessor:
    """HyperCLOVA X 기반 LLM 처리 모듈"""

    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.tax_engine = TaxRuleEngine()
        self.is_initialized = False
        self.device = 'cpu'
        
        # 시스템 프롬프트
        self.system_prompt = """당신은 대한민국 지방세법 취득세 전문가입니다.

주요 역할:
1. 취득세 관련 질문에 정확하고 상세한 답변 제공
2. 지방세법 제2장 취득세 규정 기준 해석
3. 다주택 보유시 중과세 계산 및 설명
4. 조정대상지역 여부에 따른 세율 차이 설명
5. 주택수 산정 기준 (시행령 제28조의4) 적용

답변 형식:
- 해당 법령 조항 명시
- 구체적인 계산 과정 설명
- 절세 방안 제시 (합법적 범위 내)
- 신고 기한 및 유의사항 안내

전문적이고 친절한 톤으로 답변하세요."""

    def initialize_model(self, force_cpu=False):
        """Load the HyperCLOVA X tokenizer and model once.

        Args:
            force_cpu: When True, load on CPU without querying CUDA.

        Returns:
            True when the model is ready (including when it was already
            loaded). False when transformers is missing, no HuggingFace token
            is set in the environment, or loading raised an exception.
        """
        if not TRANSFORMERS_AVAILABLE:
            print("❌ Transformers 라이브러리를 설치해주세요: pip install transformers torch")
            return False

        if self.is_initialized:
            return True

        print("🔄 HyperCLOVA X 1.5B 모델 초기화 중...")

        try:
            # A HuggingFace access token must be present in the environment.
            hf_token = os.getenv('HF_TOKEN') or os.getenv('HUGGINGFACE_HUB_TOKEN')
            if not hf_token:
                print("⚠️ HuggingFace 토큰이 필요합니다")
                return False

            # Use the GPU only when it is allowed and actually present.
            use_gpu = (not force_cpu) and torch.cuda.is_available()
            if use_gpu:
                self.device = 'cuda'
                print(f"🔥 GPU 모드로 실행: {torch.cuda.get_device_name()}")
            else:
                self.device = 'cpu'
                print("💻 CPU 모드로 실행")

            model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-1.5B"

            config = LlamaConfig.from_pretrained(model_name, token=hf_token)

            self.tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                token=hf_token,
                legacy=False,
                add_eos_token=True,
                add_bos_token=True
            )

            # Fall back to EOS for padding when no pad token is defined.
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            # Arguments shared by the GPU and CPU loading paths.
            load_kwargs = {
                'config': config,
                'token': hf_token,
                'low_cpu_mem_usage': True,
            }

            if self.device == 'cuda':
                # GPU: 8-bit quantised weights, float16 compute.
                load_kwargs['quantization_config'] = BitsAndBytesConfig(
                    load_in_8bit=True,
                    llm_int8_enable_fp32_cpu_offload=True,
                    llm_int8_threshold=6.0
                )
                load_kwargs['torch_dtype'] = torch.float16
                load_kwargs['device_map'] = "auto"
                self.model = LlamaForCausalLM.from_pretrained(model_name, **load_kwargs)
            else:
                # CPU: full-precision float32 weights.
                load_kwargs['torch_dtype'] = torch.float32
                self.model = LlamaForCausalLM.from_pretrained(model_name, **load_kwargs)
                self.model = self.model.to('cpu')

            self.is_initialized = True
            print(f"✅ HyperCLOVA X 모델 초기화 완료 ({self.device})")

            return True

        except Exception as e:
            # Any loading failure is reported and turned into a False result.
            print(f"❌ 모델 초기화 실패: {e}")
            return False

    def extract_property_info(self, user_input):
        """사용자 입력에서 부동산 정보 자동 추출"""
        property_info = {
            'property_type': '주택',
            'acquisition_type': '매매',
            'acquisition_value': None,
            'location': '',
            'housing_list': []
        }

        # 금액 추출 (다양한 단위 지원)
        amount_patterns = [
            (r'(\d+(?:\.\d+)?)억', 100000000),
            (r'(\d+(?:,\d+)?)만원', 10000),
        ]

        for pattern, multiplier in amount_patterns:
            amounts = re.findall(pattern, user_input)
            if amounts:
                amount_str = amounts[0].replace(',', '')
                property_info['acquisition_value'] = int(float(amount_str) * multiplier)
                break

        # 지역 추출
        for area in self.tax_engine.adjustment_areas:
            area_name = area.replace('구', '')
            if area_name in user_input or area in user_input:
                property_info['location'] = f'서울특별시 {area}'
                break

        # 주택수 추출
        housing_patterns = [r'(\d+)주택', r'기존.*?(\d+).*?주택', r'(\d+).*?보유']
        for pattern in housing_patterns:
            matches = re.findall(pattern, user_input)
            if matches:
                existing_count = int(matches[0]) - 1
                for i in range(max(0, existing_count)):
                    property_info['housing_list'].append({
                        'id': f'existing_house_{i+1}',
                        'type': '주택',
                        'acquisition_type': '매매',
                        'value': 500000000
                    })
                break

        return property_info

    def format_tax_result(self, result, property_info):
        """계산 결과를 사용자 친화적으로 포맷팅"""
        if not result:
            return "📋 정확한 계산을 위해 부동산 가격을 구체적으로 알려주시면 도움이 됩니다."

        output = f"""📋 **취득세 계산 결과**
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🏠 **취득가액**: {result['acquisition_value']:,}원
🏘️ **총 주택수**: {result['total_housing_count']}주택
📍 **조정대상지역**: {'예' if result['is_adjustment_area'] else '아니오'}

💰 **세율 정보**
   • 기본세율: {result['base_rate']}‰ ({result['base_rate']/10:.1f}%)
   • 최종세율: {result['final_rate']}‰ ({result['final_rate']/10:.1f}%)

💸 **취득세액**: {result['tax_amount']:,}원"""

        if result['heavy_tax_type']:
            output += f"\n⚠️ **중과세 적용**: {result['heavy_tax_type']}"

        output += f"""\n\n📜 **법령 근거**
   • 지방세법 제11조 (부동산 취득세)
   • 지방세법 제13조 (중과세)
   • 지방세법 시행령 제28조의4 (주택수 산정)
   • 신고기한: 취득일로부터 60일 이내"""

        return output

    def generate_ai_response(self, user_input, rag_context="", max_length=300):
        """AI 응답 생성 (RAG 컨텍스트 포함)"""
        if not self.is_initialized:
            print("⚠️ 모델이 초기화되지 않았습니다. 초기화를 시도합니다...")
            if not self.initialize_model():
                return "❌ AI 모델 초기화에 실패했습니다."

        try:
            # 1. 자동 계산
            property_info = self.extract_property_info(user_input)
            tax_result = None
            tax_summary = ""

            if property_info.get('acquisition_value'):
                property_info['acquisition_date'] = datetime.now().strftime('%Y-%m-%d')
                tax_result = self.tax_engine.calculate_comprehensive_tax(property_info)
                tax_summary = self.format_tax_result(tax_result, property_info)

            # 2. AI 답변 생성을 위한 프롬프트 구성
            context_parts = []
            
            if rag_context:
                context_parts.append(f"참고 자료:\n{rag_context}")
            
            if tax_summary:
                context_parts.append(f"자동 계산 결과:\n{tax_summary}")

            context_prompt = f"""{self.system_prompt}

사용자 질문: {user_input}

{chr(10).join(context_parts)}

위 정보를 바탕으로 전문가로서 상세하고 이해하기 쉬운 설명을 제공해주세요:"""

            # 3. 토크나이징
            inputs = self.tokenizer(
                context_prompt,
                return_tensors="pt",
                max_length=1800,
                truncation=True
            ).to(self.model.device)

            # 4. AI 응답 생성
            with torch.no_grad():
                outputs = self.model.generate(
                    inputs.input_ids,
                    attention_mask=inputs.attention_mask,
                    max_new_tokens=max_length,
                    do_sample=True,
                    temperature=0.6,
                    top_p=0.85,
                    repetition_penalty=1.15,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id
                )

            # 5. 응답 디코딩
            generated_response = self.tokenizer.decode(
                outputs[0][inputs.input_ids.shape[1]:],
                skip_special_tokens=True
            ).strip()

            # 6. 최종 응답 구성
            final_response = ""
            
            if tax_summary:
                final_response += f"{tax_summary}\n\n"

            final_response += f"""🤖 **AI 전문가 상세 설명**
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
{generated_response}

---
💡 **추가 문의나 다른 상황에 대한 상담이 필요하시면 언제든 말씀해 주세요!**"""

            return final_response

        except Exception as e:
            error_response = f"❌ AI 응답 생성 중 오류가 발생했습니다: {str(e)}\n\n"
            if tax_summary:
                return error_response + tax_summary
            return error_response + "기본적인 취득세 정보는 지방세법 제11조를 참고하세요."

    def process_with_rag(self, user_input, rag_documents):
        """RAG 문서와 함께 처리"""
        # RAG 문서를 컨텍스트로 변환
        if rag_documents and len(rag_documents) > 0:
            rag_context = "\n\n".join([doc.get('content', '') for doc in rag_documents])
        else:
            rag_context = ""

        return self.generate_ai_response(user_input, rag_context)

# Module-wide processor instance, created on first request.
_llm_processor = None

def get_llm_processor():
    """Return the shared LLMProcessor, constructing it on the first call."""
    global _llm_processor
    if _llm_processor is not None:
        return _llm_processor
    _llm_processor = LLMProcessor()
    return _llm_processor

def is_llm_available():
    """Report whether transformers is importable and a CUDA device is present."""
    # NOTE(review): this requires CUDA although LLMProcessor.initialize_model()
    # can also load on CPU - confirm that excluding CPU-only hosts is intended.
    if not TRANSFORMERS_AVAILABLE:
        return TRANSFORMERS_AVAILABLE
    return torch.cuda.is_available()

def process_with_llm(user_input, rag_documents=None):
    """Convenience wrapper: answer ``user_input`` with the shared processor.

    Retrieved documents, when given and non-empty, are passed on as context.
    """
    shared = get_llm_processor()

    if not rag_documents:
        return shared.generate_ai_response(user_input)
    return shared.process_with_rag(user_input, rag_documents)

if __name__ == "__main__":
    # Manual smoke test: load the model on CPU and answer one sample question.
    print("🧪 LLM 프로세서 테스트")

    smoke_processor = LLMProcessor()

    if not smoke_processor.initialize_model(force_cpu=True):
        print("❌ 모델 초기화 실패")
    else:
        print("✅ 모델 초기화 성공")

        # Only the first 100 characters of the answer are shown.
        sample_question = "강남구 10억원 아파트 3주택자 취득세"
        answer = smoke_processor.generate_ai_response(sample_question)
        print(f"응답: {answer[:100]}...")