Spaces:

bissal
/

clovax-tax-chatbot

Paused

App Files Files Community

clovax-tax-chatbot / llm_processor.py

bissal

Complete RAG→LLM→Web pipeline integration 🤖 Generated with Claude Code

4aa05c6 4 months ago

raw

history blame contribute delete

16.3 kB

	# llm_processor.py - LLM 처리 모듈
	import os
	import re
	import time
	from datetime import datetime
	import logging

	# HuggingFace 관련 import
	try:
	from transformers import (
	AutoModelForCausalLM,
	AutoTokenizer,
	LlamaConfig,
	LlamaForCausalLM,
	BitsAndBytesConfig
	)
	import torch
	TRANSFORMERS_AVAILABLE = True
	except ImportError:
	print("⚠️ Transformers 라이브러리가 설치되지 않았습니다")
	TRANSFORMERS_AVAILABLE = False

	class TaxRuleEngine:
	    """Rule-based housing acquisition-tax calculator (extracted from a notebook).

	    Every rate handled by this class is expressed in per mille, e.g. ``80``
	    means 8%.
	    """

	    def __init__(self):
	        # Regulated ("adjustment target") areas: major Seoul districts.
	        self.adjustment_areas = [
	            "강남구", "서초구", "송파구", "용산구"
	        ]

	        # Heavy multi-home rates, in per mille, keyed by heavy-tax type.
	        self.multi_housing_rates = {
	            "1세대2주택_조정대상": 80,  # 8%
	            "1세대3주택_조정대상": 120,  # 12%
	            "1세대4주택이상_조정대상": 120,  # 12%
	            "1세대3주택_조정대상외": 80,  # 8%
	            "1세대4주택이상_조정대상외": 120,  # 12%
	        }

	    def calculate_housing_tax_rate(self, acquisition_value):
	        """Return the base housing rate in per mille for ``acquisition_value`` (KRW).

	        Up to 600M KRW the rate is 10, above 900M KRW it is 30, and in between
	        it is interpolated linearly and rounded to four decimal places.
	        (The original comment cites Local Tax Act Article 11.)
	        """
	        if acquisition_value <= 600000000:  # 600M KRW or less
	            return 10
	        if acquisition_value <= 900000000:  # over 600M, up to 900M KRW
	            excess = acquisition_value - 600000000
	            rate = (excess / 300000000) * 20 + 10
	            return round(rate, 4)
	        return 30  # over 900M KRW

	    def is_adjustment_area(self, location):
	        """Return True when ``location`` contains one of ``adjustment_areas``.

	        A missing location (``None`` or an empty string) is treated as
	        "not an adjustment area" instead of raising ``TypeError``.
	        """
	        if not location:
	            return False
	        return any(area in location for area in self.adjustment_areas)

	    def determine_multi_housing_heavy_tax(self, total_housing_count, is_adjustment_area, acquisition_type="매매"):
	        """Return the heavy-tax type key for the acquisition, or ``None``.

	        Args:
	            total_housing_count: Number of homes including the one being acquired.
	            is_adjustment_area: Whether the property lies in an adjustment area.
	            acquisition_type: Acquisition kind; inheritance/gift/gratuitous
	                acquisitions take a separate branch.

	        Returns:
	            A key of ``multi_housing_rates``, the string
	            ``'조정지역고가주택증여'``, or ``None`` when no heavy tax applies.
	        """
	        if acquisition_type in ['상속', '증여', '무상취득']:
	            # NOTE(review): the returned label mentions gifts only, yet this
	            # branch also covers inheritance - confirm against the statute.
	            if is_adjustment_area and total_housing_count >= 2:
	                return '조정지역고가주택증여'  # 12%
	            return None

	        if total_housing_count <= 1:
	            return None
	        if total_housing_count == 2:
	            return '1세대2주택_조정대상' if is_adjustment_area else None
	        if total_housing_count == 3:
	            return '1세대3주택_조정대상' if is_adjustment_area else '1세대3주택_조정대상외'
	        # Four homes or more.
	        return '1세대4주택이상_조정대상' if is_adjustment_area else '1세대4주택이상_조정대상외'

	    def calculate_comprehensive_tax(self, property_info):
	        """Compute the acquisition tax for ``property_info``.

	        Args:
	            property_info: Dict read with the keys ``acquisition_value``
	                (required), ``housing_list``, ``location``, ``heavy_tax_type``
	                and ``acquisition_type`` (all optional; ``None`` is accepted
	                for ``housing_list`` and ``location``).

	        Returns:
	            ``None`` when no acquisition value is given, otherwise a dict with
	            ``tax_amount``, ``base_rate``, ``final_rate``, ``heavy_tax_type``,
	            ``is_adjustment_area``, ``total_housing_count`` and
	            ``acquisition_value``.
	        """
	        acquisition_value = property_info.get('acquisition_value')
	        if not acquisition_value:
	            return None

	        # Base rate.
	        base_rate = self.calculate_housing_tax_rate(acquisition_value)

	        # Home count (existing homes + the one being acquired) and area check.
	        # `or` guards against keys that are present but explicitly None.
	        existing_homes = property_info.get('housing_list') or []
	        total_housing_count = len(existing_homes) + 1
	        is_adjustment_area = self.is_adjustment_area(property_info.get('location') or '')

	        # Heavy-tax type: a caller-supplied value wins over the derived one.
	        heavy_tax_type = property_info.get('heavy_tax_type')
	        if not heavy_tax_type:
	            heavy_tax_type = self.determine_multi_housing_heavy_tax(
	                total_housing_count,
	                is_adjustment_area,
	                property_info.get('acquisition_type', '매매')
	            )

	        # Final rate: the heavy rate replaces the base rate when one applies.
	        final_rate = base_rate
	        if heavy_tax_type and heavy_tax_type in self.multi_housing_rates:
	            final_rate = self.multi_housing_rates[heavy_tax_type]
	        elif heavy_tax_type == '조정지역고가주택증여':
	            final_rate = 120  # 12%

	        # Exemption threshold: acquisitions of 500,000 KRW or less owe nothing.
	        if acquisition_value <= 500000:
	            tax_amount = 0
	        else:
	            tax_amount = int(acquisition_value * (final_rate / 1000))

	        return {
	            'tax_amount': tax_amount,
	            'base_rate': base_rate,
	            'final_rate': final_rate,
	            'heavy_tax_type': heavy_tax_type,
	            'is_adjustment_area': is_adjustment_area,
	            'total_housing_count': total_housing_count,
	            'acquisition_value': acquisition_value
	        }

	class LLMProcessor:
	    """LLM processing module built on HyperCLOVA X.

	    Combines a rule-based tax calculation (``TaxRuleEngine``) with text
	    generated by a causal language model, optionally grounded on RAG context.
	    """

	    # Upper bound on placeholder "existing home" entries built from user text.
	    # The count comes straight from untrusted chat input, so an absurd number
	    # must not be allowed to allocate an unbounded list.
	    MAX_EXISTING_HOMES = 100

	    def __init__(self):
	        self.model = None
	        self.tokenizer = None
	        self.tax_engine = TaxRuleEngine()
	        self.is_initialized = False
	        self.device = 'cpu'

	        # System prompt
	        self.system_prompt = """당신은 대한민국 지방세법 취득세 전문가입니다.

	주요 역할:
	1. 취득세 관련 질문에 정확하고 상세한 답변 제공
	2. 지방세법 제2장 취득세 규정 기준 해석
	3. 다주택 보유시 중과세 계산 및 설명
	4. 조정대상지역 여부에 따른 세율 차이 설명
	5. 주택수 산정 기준 (시행령 제28조의4) 적용

	답변 형식:
	- 해당 법령 조항 명시
	- 구체적인 계산 과정 설명
	- 절세 방안 제시 (합법적 범위 내)
	- 신고 기한 및 유의사항 안내

	전문적이고 친절한 톤으로 답변하세요."""

	    def initialize_model(self, force_cpu=False):
	        """Load the HyperCLOVA X tokenizer and model.

	        Args:
	            force_cpu: When True, use the CPU even if CUDA is available.

	        Returns:
	            True when the model is ready (or was already initialized), False
	            when transformers is missing, no Hugging Face token is set, or
	            loading raised an exception.
	        """
	        if not TRANSFORMERS_AVAILABLE:
	            print("❌ Transformers 라이브러리를 설치해주세요: pip install transformers torch")
	            return False

	        if self.is_initialized:
	            return True

	        print("🔄 HyperCLOVA X 1.5B 모델 초기화 중...")

	        try:
	            # A Hugging Face token is required; it is read from the environment.
	            hf_token = os.getenv('HF_TOKEN') or os.getenv('HUGGINGFACE_HUB_TOKEN')
	            if not hf_token:
	                print("⚠️ HuggingFace 토큰이 필요합니다")
	                return False

	            # Device selection
	            if force_cpu or not torch.cuda.is_available():
	                self.device = 'cpu'
	                print("💻 CPU 모드로 실행")
	            else:
	                self.device = 'cuda'
	                print(f"🔥 GPU 모드로 실행: {torch.cuda.get_device_name()}")

	            model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-1.5B"

	            # Load config
	            config = LlamaConfig.from_pretrained(model_name, token=hf_token)

	            # Load tokenizer
	            self.tokenizer = AutoTokenizer.from_pretrained(
	                model_name,
	                token=hf_token,
	                legacy=False,
	                add_eos_token=True,
	                add_bos_token=True
	            )

	            # generate() needs a pad token; fall back to EOS when none is defined.
	            if self.tokenizer.pad_token is None:
	                self.tokenizer.pad_token = self.tokenizer.eos_token

	            # Load model
	            if self.device == 'cuda':
	                # GPU: 8-bit quantization
	                quantization_config = BitsAndBytesConfig(
	                    load_in_8bit=True,
	                    llm_int8_enable_fp32_cpu_offload=True,
	                    llm_int8_threshold=6.0
	                )

	                self.model = LlamaForCausalLM.from_pretrained(
	                    model_name,
	                    config=config,
	                    quantization_config=quantization_config,
	                    torch_dtype=torch.float16,
	                    device_map="auto",
	                    token=hf_token,
	                    low_cpu_mem_usage=True
	                )
	            else:
	                # CPU: float32
	                self.model = LlamaForCausalLM.from_pretrained(
	                    model_name,
	                    config=config,
	                    torch_dtype=torch.float32,
	                    token=hf_token,
	                    low_cpu_mem_usage=True
	                )
	                self.model = self.model.to('cpu')

	            self.is_initialized = True
	            print(f"✅ HyperCLOVA X 모델 초기화 완료 ({self.device})")

	            return True

	        except Exception as e:
	            # Best-effort initialization: report the failure and let the caller decide.
	            print(f"❌ 모델 초기화 실패: {e}")
	            return False

	    def extract_property_info(self, user_input):
	        """Extract property facts from free-form user text.

	        Args:
	            user_input: The user's question as a string.

	        Returns:
	            Dict with ``property_type``, ``acquisition_type``,
	            ``acquisition_value`` (KRW as int, or ``None`` when no amount was
	            found), ``location`` and ``housing_list`` (placeholder entries for
	            homes already owned).
	        """
	        property_info = {
	            'property_type': '주택',
	            'acquisition_type': '매매',
	            'acquisition_value': None,
	            'location': '',
	            'housing_list': []
	        }

	        # Amount: "<n>억" optionally followed by "<m>만", else a bare "<m>만원".
	        # Units such as 천 inside the amount ("5억5천만원") are not parsed.
	        eok_match = re.search(r'(\d+(?:\.\d+)?)\s*억(?:\s*(\d[\d,]*)\s*만)?', user_input)
	        if eok_match:
	            # round(), not int(): float("2.3") * 1e8 is 229999999.99999997 and
	            # truncating it would lose one won.
	            amount = round(float(eok_match.group(1)) * 100000000)
	            if eok_match.group(2):
	                amount += int(eok_match.group(2).replace(',', '')) * 10000
	            property_info['acquisition_value'] = amount
	        else:
	            man_match = re.search(r'(\d[\d,]*)\s*만\s*원', user_input)
	            if man_match:
	                property_info['acquisition_value'] = int(man_match.group(1).replace(',', '')) * 10000

	        # Location: only the known adjustment areas are recognized, with or
	        # without the trailing "구".
	        for area in self.tax_engine.adjustment_areas:
	            area_name = area.replace('구', '')
	            if area_name in user_input or area in user_input:
	                property_info['location'] = f'서울특별시 {area}'
	                break

	        # Home count. The matched number is treated as the total INCLUDING the
	        # home being acquired, so the number of existing homes is N - 1.
	        # NOTE(review): for the "기존 N주택" wording N probably already means
	        # existing homes - confirm the intended convention.
	        housing_patterns = [
	            r'(\d+)주택',
	            r'기존.?(\d+).?주택',
	            # The digits must be followed by a housing counter so that amounts
	            # such as "10억 ... 보유" are not mistaken for a home count.
	            r'(\d+)\s*(?:채|가구|개)\D*?보유',
	        ]
	        for pattern in housing_patterns:
	            match = re.search(pattern, user_input)
	            if match:
	                existing_count = int(match.group(1)) - 1
	                existing_count = max(0, min(existing_count, self.MAX_EXISTING_HOMES))
	                property_info['housing_list'] = [
	                    {
	                        'id': f'existing_house_{i+1}',
	                        'type': '주택',
	                        'acquisition_type': '매매',
	                        'value': 500000000
	                    }
	                    for i in range(existing_count)
	                ]
	                break

	        return property_info

	    def format_tax_result(self, result, property_info):
	        """Render a calculation result as user-facing text.

	        Args:
	            result: Dict produced by ``calculate_comprehensive_tax`` or a falsy
	                value when no calculation was possible.
	            property_info: Accepted for interface compatibility; not read.

	        Returns:
	            The formatted summary, or a prompt asking for the price when
	            ``result`` is falsy.
	        """
	        if not result:
	            return "📋 정확한 계산을 위해 부동산 가격을 구체적으로 알려주시면 도움이 됩니다."

	        output = f"""📋 취득세 계산 결과
	━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	🏠 취득가액: {result['acquisition_value']:,}원
	🏘️ 총 주택수: {result['total_housing_count']}주택
	📍 조정대상지역: {'예' if result['is_adjustment_area'] else '아니오'}

	💰 세율 정보
	• 기본세율: {result['base_rate']}‰ ({result['base_rate']/10:.1f}%)
	• 최종세율: {result['final_rate']}‰ ({result['final_rate']/10:.1f}%)

	💸 취득세액: {result['tax_amount']:,}원"""

	        if result['heavy_tax_type']:
	            output += f"\n⚠️ 중과세 적용: {result['heavy_tax_type']}"

	        output += f"""\n\n📜 법령 근거
	• 지방세법 제11조 (부동산 취득세)
	• 지방세법 제13조 (중과세)
	• 지방세법 시행령 제28조의4 (주택수 산정)
	• 신고기한: 취득일로부터 60일 이내"""

	        return output

	    def generate_ai_response(self, user_input, rag_context="", max_length=300):
	        """Generate an answer, combining the rule-based result with LLM text.

	        Args:
	            user_input: The user's question.
	            rag_context: Optional retrieved reference text added to the prompt.
	            max_length: Maximum number of newly generated tokens.

	        Returns:
	            The final response string. On failure an error message is returned
	            instead of raising, followed by the rule-based summary when one
	            could be computed.
	        """
	        if not self.is_initialized:
	            print("⚠️ 모델이 초기화되지 않았습니다. 초기화를 시도합니다...")
	            if not self.initialize_model():
	                return "❌ AI 모델 초기화에 실패했습니다."

	        # Bound before the try block: the except handler reads it, and it must
	        # exist even when the very first statement inside the try raises.
	        tax_summary = ""

	        try:
	            # 1. Automatic rule-based calculation
	            property_info = self.extract_property_info(user_input)
	            tax_result = None

	            if property_info.get('acquisition_value'):
	                property_info['acquisition_date'] = datetime.now().strftime('%Y-%m-%d')
	                tax_result = self.tax_engine.calculate_comprehensive_tax(property_info)
	                tax_summary = self.format_tax_result(tax_result, property_info)

	            # 2. Build the prompt for the model
	            context_parts = []

	            if rag_context:
	                context_parts.append(f"참고 자료:\n{rag_context}")

	            if tax_summary:
	                context_parts.append(f"자동 계산 결과:\n{tax_summary}")

	            context_prompt = f"""{self.system_prompt}

	사용자 질문: {user_input}

	{chr(10).join(context_parts)}

	위 정보를 바탕으로 전문가로서 상세하고 이해하기 쉬운 설명을 제공해주세요:"""

	            # 3. Tokenize (the prompt is truncated to 1800 tokens)
	            inputs = self.tokenizer(
	                context_prompt,
	                return_tensors="pt",
	                max_length=1800,
	                truncation=True
	            ).to(self.model.device)

	            # 4. Generate
	            with torch.no_grad():
	                outputs = self.model.generate(
	                    inputs.input_ids,
	                    attention_mask=inputs.attention_mask,
	                    max_new_tokens=max_length,
	                    do_sample=True,
	                    temperature=0.6,
	                    top_p=0.85,
	                    repetition_penalty=1.15,
	                    pad_token_id=self.tokenizer.pad_token_id,
	                    eos_token_id=self.tokenizer.eos_token_id
	                )

	            # 5. Decode only the newly generated tokens (skip the prompt part)
	            generated_response = self.tokenizer.decode(
	                outputs[0][inputs.input_ids.shape[1]:],
	                skip_special_tokens=True
	            ).strip()

	            # 6. Assemble the final response
	            final_response = ""

	            if tax_summary:
	                final_response += f"{tax_summary}\n\n"

	            final_response += f"""🤖 AI 전문가 상세 설명
	━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	{generated_response}

	---
	💡 추가 문의나 다른 상황에 대한 상담이 필요하시면 언제든 말씀해 주세요!"""

	            return final_response

	        except Exception as e:
	            # Top-level boundary for the chat UI: degrade to the rule-based
	            # summary (if any) rather than propagating the error.
	            error_response = f"❌ AI 응답 생성 중 오류가 발생했습니다: {str(e)}\n\n"
	            if tax_summary:
	                return error_response + tax_summary
	            return error_response + "기본적인 취득세 정보는 지방세법 제11조를 참고하세요."

	    def process_with_rag(self, user_input, rag_documents):
	        """Answer ``user_input`` using retrieved documents as context.

	        Args:
	            user_input: The user's question.
	            rag_documents: Iterable of dicts; each ``content`` value is joined
	                into the context (missing or ``None`` content counts as empty).

	        Returns:
	            The string returned by ``generate_ai_response``.
	        """
	        if not rag_documents:
	            rag_context = ""
	        else:
	            rag_context = "\n\n".join((doc.get('content') or '') for doc in rag_documents)

	        return self.generate_ai_response(user_input, rag_context)

	# Module-level singleton instance, created on first request.
	_llm_processor = None

	def get_llm_processor():
	    """Return the shared LLMProcessor instance, creating it if needed."""
	    global _llm_processor
	    if _llm_processor is not None:
	        return _llm_processor
	    _llm_processor = LLMProcessor()
	    return _llm_processor

	def is_llm_available():
	    """Report whether the LLM stack is usable: transformers imported and CUDA present."""
	    # NOTE(review): this is False on CPU-only hosts even though
	    # LLMProcessor.initialize_model has a CPU path - confirm that is intended.
	    if not TRANSFORMERS_AVAILABLE:
	        # torch may not have been imported at all, so it must not be touched here.
	        return TRANSFORMERS_AVAILABLE
	    return torch.cuda.is_available()

	def process_with_llm(user_input, rag_documents=None):
	    """Convenience wrapper: answer ``user_input``, using RAG documents when given."""
	    llm = get_llm_processor()

	    # No (or empty) documents: plain generation without retrieved context.
	    if not rag_documents:
	        return llm.generate_ai_response(user_input)
	    return llm.process_with_rag(user_input, rag_documents)

	if __name__ == "__main__":
	    # Manual smoke test: initialize on CPU and run one sample question.
	    print("🧪 LLM 프로세서 테스트")

	    smoke_processor = LLMProcessor()

	    # Initialization check
	    if not smoke_processor.initialize_model(force_cpu=True):
	        print("❌ 모델 초기화 실패")
	    else:
	        print("✅ 모델 초기화 성공")

	        # Simple end-to-end check; only the first 100 characters are shown.
	        sample_question = "강남구 10억원 아파트 3주택자 취득세"
	        answer = smoke_processor.generate_ai_response(sample_question)
	        print(f"응답: {answer[:100]}...")