|
|
"""
|
|
|
B2NL-IntelligentTokenizer v6.2.1 - inference code that actually works.

This file shows the main way to use the tokenizer.
|
|
|
"""
|
|
|
|
|
|
import torch
|
|
|
import sys
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "intelligent-tokenizer_v6.2.1"))
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "intelligent-tokenizer_v6.2.1/core"))
|
|
|
|
|
|
from core.unified_model import IntelligentTokenizerV62
|
|
|
from core.tokenizer import ByteTokenizerV62
|
|
|
|
|
|
|
|
|
class B2NLTokenizer:
    """Inference wrapper around the B2NL ``IntelligentTokenizerV62`` model.

    The constructor loads a checkpoint into the model and puts it in eval
    mode. ``compress`` delegates to the model; ``reconstruct`` encodes a text
    and autoregressively decodes it back into a string.
    """

    # Checkpoint loaded when the constructor is called without a path.
    DEFAULT_CHECKPOINT_PATH = "D:/intelligent-tokenizer/intelligent-tokenizer_v6.2.1/checkpoints/v62/16.0/epoch_100.pt"

    # How many times the single encoder tensor is repeated when the encoder
    # output has no 'all_hidden_states' entry.
    # NOTE(review): presumably one entry per decoder level -- confirm against
    # the decoder implementation.
    _FALLBACK_HIDDEN_COPIES = 4

    def __init__(self, checkpoint_path: str = None):
        """Load the model weights and move the model to the best device.

        Args:
            checkpoint_path: Path of the checkpoint file. When ``None``,
                ``DEFAULT_CHECKPOINT_PATH`` is used.
        """
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        if checkpoint_path is None:
            checkpoint_path = self.DEFAULT_CHECKPOINT_PATH

        self.model = IntelligentTokenizerV62()
        # NOTE(security): weights_only=False lets torch.load unpickle arbitrary
        # objects, which can execute code. Only load checkpoints you trust.
        checkpoint = torch.load(checkpoint_path, map_location=self.device, weights_only=False)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model = self.model.to(self.device)
        self.model.eval()

        print(f"Model loaded successfully on {self.device}")

    def compress(self, text: str) -> dict:
        """Compress *text* by delegating to the model's ``compress`` method.

        Returns:
            Whatever ``self.model.compress(text)`` returns, unchanged.
        """
        return self.model.compress(text)

    @staticmethod
    def _as_batch(tensor):
        """Return *tensor* with a leading axis added when it is 1-D."""
        return tensor.unsqueeze(0) if tensor.dim() == 1 else tensor

    @staticmethod
    def _pick_next_token(logits, temperature: float, top_k: int):
        """Pick one token id per row of *logits* (last axis = vocabulary).

        With ``temperature <= 0`` the highest-scoring token is taken (greedy).
        Otherwise the logits are divided by ``temperature``, everything below
        the k-th best score is masked out, and one id is sampled.
        """
        if temperature <= 0:
            # Dividing by zero would yield inf/nan logits; greedy choice is
            # the limit of sampling as the temperature goes to zero.
            return logits.argmax(dim=-1, keepdim=True)

        scaled = logits / temperature
        # torch.topk raises when k exceeds the size of the last axis.
        k = min(top_k, scaled.size(-1))
        kth_best = torch.topk(scaled, k)[0][..., -1, None]
        scaled = scaled.masked_fill(scaled < kth_best, float('-inf'))
        probs = torch.nn.functional.softmax(scaled, dim=-1)
        return torch.multinomial(probs, num_samples=1)

    def reconstruct(
        self,
        text: str,
        temperature: float = 0.1,
        *,
        max_length: int = 48,
        top_k: int = 10,
    ) -> str:
        """Encode *text* and decode it back into a string.

        Args:
            text: Input text.
            temperature: Sampling temperature; lower is more deterministic.
                Values ``<= 0`` select greedy (argmax) decoding.
            max_length: Maximum length of the generated sequence, counting
                the initial BOS token.
            top_k: Number of highest-scoring candidates kept for sampling;
                clamped to the vocabulary size.

        Returns:
            The tokenizer's decoding of the first generated sequence.

        Raises:
            ValueError: If ``max_length`` or ``top_k`` is smaller than 1.
        """
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        if top_k < 1:
            raise ValueError(f"top_k must be >= 1, got {top_k}")

        tokenizer = self.model.tokenizer
        encoded = tokenizer.encode(text)

        # The tokenizer may return either a dict of tensors or a bare tensor.
        if isinstance(encoded, dict):
            input_ids = self._as_batch(encoded['input_ids'])
            attention_mask = self._as_batch(encoded['attention_mask'])
        else:
            input_ids = self._as_batch(encoded)
            attention_mask = torch.ones_like(input_ids)

        input_ids = input_ids.to(self.device)
        attention_mask = attention_mask.to(self.device)

        # One no_grad scope for the whole pass instead of one per step.
        with torch.no_grad():
            encoder_outputs = self.model.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )

            if 'all_hidden_states' in encoder_outputs:
                encoder_all_hidden = encoder_outputs['all_hidden_states']
            else:
                # Fall back to a single tensor repeated for the decoder.
                compressed = encoder_outputs.get('compressed', encoder_outputs.get('hidden_states'))
                encoder_all_hidden = [compressed] * self._FALLBACK_HIDDEN_COPIES

            batch_size = input_ids.size(0)
            # Every sequence starts with the BOS token.
            generated = torch.full((batch_size, 1), tokenizer.BOS, device=self.device)

            for _ in range(max_length - 1):
                decoder_outputs = self.model.decoder(
                    encoder_all_hidden=encoder_all_hidden,
                    decoder_input_ids=generated,
                    attention_mask=torch.ones_like(generated),
                    use_cache=False,
                )

                # Only the scores at the last position decide the next token.
                last_logits = decoder_outputs['logits'][:, -1, :]
                next_token = self._pick_next_token(last_logits, temperature, top_k)
                generated = torch.cat([generated, next_token], dim=1)

                # Stop once every sequence in the batch has just emitted EOS.
                if (next_token == tokenizer.EOS).all():
                    break

        # `generated` is always 2-D here, so decode its first row.
        return tokenizer.decode(generated[0])
|
|
|
|
|
|
|
|
|
def test_tokenizer():
    """Run a compress/reconstruct round trip on sample texts and print results.

    For each sample the compression ratio, the reconstructed text, and a
    position-wise character accuracy (matches divided by the original
    length, as a percentage) are printed.
    """
    banner = "=" * 60
    print(banner)
    print("B2NL-IntelligentTokenizer v6.2.1 테스트")
    print(banner)

    b2nl = B2NLTokenizer()

    samples = (
        "Hello, world!",
        "안녕하세요, 반갑습니다.",
        "The quick brown fox jumps over the lazy dog.",
        "人工智能技术正在改变世界。",
    )

    for sample in samples:
        print(f"\n원본: {sample}")

        stats = b2nl.compress(sample)
        print(f"압축률: {stats['compression_ratio']:.1f}:1 ({stats['num_tokens']} 토큰)")

        restored = b2nl.reconstruct(sample, temperature=0.1)
        print(f"복원: {restored}")

        # zip stops at the shorter string, so only overlapping positions count.
        matches = sum(1 for src_ch, out_ch in zip(sample, restored) if src_ch == out_ch)
        accuracy = matches / len(sample) * 100
        print(f"정확도: {accuracy:.1f}%")

    print("\n" + banner)
    print("Test completed!")
    print(banner)
|
|
|
|
|
|
|
|
|
|
|
|
def example_usage():
    """Show the minimal compress/reconstruct workflow on one sentence.

    Returns:
        The ``B2NLTokenizer`` instance that was created.
    """
    b2nl = B2NLTokenizer()
    sample = "안녕하세요, 반갑습니다!"

    # Compress first and report the ratio.
    ratio = b2nl.compress(sample)['compression_ratio']
    print(f"압축 결과: {ratio:.1f}:1")

    # Then run the round trip with the default temperature.
    restored = b2nl.reconstruct(sample)
    print(f"복원 결과: {restored}")

    return b2nl
|
|
|
|
|
|
|
|
|
# Script entry point: run the round-trip smoke test when executed directly.
if __name__ == "__main__":
    test_tokenizer()