File size: 6,889 Bytes

cc78280
348bcce
 
cc78280
348bcce
128be27
348bcce
cc78280
 
 
128be27
cc78280
 
 
 
 
 
 
 
 
128be27
 
 
 
 
 
 
 
 
cc78280
128be27
 
 
 
cc78280
128be27
 
 
 
cc78280
128be27
cc78280
348bcce
cc78280
348bcce
cc78280
348bcce
 
 
cc78280
 
348bcce
cc78280
 
348bcce
 
cc78280
348bcce
cc78280
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348bcce
 
cc78280
348bcce
cc78280
 
348bcce
cc78280
 
 
 
 
 
348bcce
cc78280
 
 
 
 
 
 
 
 
 
 
 
 
348bcce
cc78280
 
 
 
 
348bcce
 
cc78280
 
 
348bcce
 
 
cc78280

import os, json, random, numpy as np, torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import IterableDataset, DataLoader
import sentencepiece as spm
import requests

# ===============================
# 0️⃣ Environment setup
# ===============================
# Local path of the SentencePiece model file (downloaded on first run).
TOKENIZER_PATH = "ko_unigram.model"
# Local path of the training corpus, read one sentence per line.
DATA_PATH = "corpus.txt"
# Token sequences are truncated and padded to this length; it is also the
# size of the positional embedding table.
MAX_LEN = 128
# Width of the token/position embeddings inside the encoder.
EMBED_DIM = 384
# Width of the sentence vector produced by the encoder.
LATENT_DIM = 384
# Number of pairs per DataLoader batch.
BATCH_SIZE = 384
# Number of randomly drawn pairs (label 0) emitted after each positive pair.
NEGATIVE_RATIO = 1

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ===============================
# 1️⃣ File download
# ===============================
def download_file(url, save_path, timeout=60):
    """Stream ``url`` to ``save_path`` and print a confirmation line.

    The body is written to ``save_path + ".part"`` first and renamed into
    place only after the whole response has been received, so an interrupted
    download never leaves a truncated file under the final name (which a
    later "does the file exist?" check would mistake for a complete one).

    Args:
        url: HTTP(S) address to fetch.
        save_path: Destination path on the local filesystem.
        timeout: Seconds to wait for the connection and for each read before
            ``requests`` gives up; passed straight to ``requests.get``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If the destination cannot be written.
    """
    tmp_path = f"{save_path}.part"
    try:
        # The context manager releases the streamed connection even when
        # the transfer fails half-way.
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(8192 * 2):
                    f.write(chunk)
        # Rename into place only once the whole body is on disk.
        os.replace(tmp_path, save_path)
    except BaseException:
        # Drop the partial file so a retry starts clean, then re-raise.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    print(f"Saved {save_path}")

# Fetch the tokenizer model and the training corpus on first run only;
# anything already present on disk is left untouched.
for _target, _source in (
    (
        TOKENIZER_PATH,
        "https://huggingface.co/Yuchan5386/inlam-100m/resolve/main/ko_unigram.model?download=true",
    ),
    (
        DATA_PATH,
        "https://huggingface.co/datasets/Yuchan5386/1/resolve/main/shuffled_corpus.txt?download=true",
    ),
):
    if os.path.exists(_target):
        continue
    download_file(_source, _target)

# ===============================
# 2️⃣ Tokenizer setup
# ===============================
# Load the SentencePiece model and read the vocabulary facts the rest of the
# script relies on.
sp = spm.SentencePieceProcessor(TOKENIZER_PATH)
vocab_size = sp.get_piece_size()

# Pad with the id of the "<pad>" piece; fall back to id 0 when the lookup
# reports -1.
# NOTE(review): SentencePiece may map an unknown piece to the unknown-token
# id rather than -1, in which case this fallback never triggers — confirm
# against the tokenizer model in use.
_pad_lookup = sp.piece_to_id("<pad>")
pad_id = 0 if _pad_lookup == -1 else _pad_lookup

def encode_sentence(sentence, max_len=MAX_LEN):
    """Tokenize ``sentence`` into SentencePiece ids, keeping at most ``max_len``.

    Args:
        sentence: Text to tokenize.
        max_len: Maximum number of ids to keep; anything beyond is dropped.

    Returns:
        The (possibly truncated) sequence of integer token ids.
    """
    token_ids = sp.encode(sentence, out_type=int)
    return token_ids[:max_len]

def pad_sentence(tokens, max_len=None, pad_value=None):
    """Return ``tokens`` as a new list of exactly ``max_len`` ids.

    Shorter inputs are right-padded with ``pad_value``; longer inputs are
    truncated. Without truncation an over-length input would come back
    unpadded and over-length, which breaks batching into fixed-size tensors.

    Args:
        tokens: Sequence of token ids.
        max_len: Target length. Defaults to the module-level ``MAX_LEN``.
        pad_value: Id used for padding. Defaults to the module-level
            ``pad_id``.

    Returns:
        A list of length ``max_len``. The input sequence is not modified.
    """
    # Resolve defaults at call time so the function follows the module
    # configuration without binding it at definition time.
    if max_len is None:
        max_len = MAX_LEN
    if pad_value is None:
        pad_value = pad_id
    kept = list(tokens[:max_len])
    return kept + [pad_value] * (max_len - len(kept))

# ===============================
# 3️⃣ Streaming Dataset
# ===============================
class PairStream(IterableDataset):
    """Endless stream of ``(anchor_ids, other_ids, label)`` training pairs.

    Every non-empty line of the corpus is used as an anchor. For each anchor
    the stream yields one positive pair (the anchor with itself, label 1.0)
    followed by ``negative_ratio`` pairs of the anchor with a randomly drawn
    sentence (label 0.0). After the last line the corpus is walked again,
    so iteration never stops on its own.

    The whole corpus is held in memory as a list of strings.

    Args:
        txt_path: UTF-8 text file with one sentence per line.
        negative_ratio: Number of negative pairs emitted per anchor.

    Raises:
        ValueError: If the file contains no non-empty line. Iterating an
            empty corpus would otherwise loop forever without yielding.
    """

    # Upper bound on draws when a sampled negative equals the anchor text.
    _MAX_NEGATIVE_DRAWS = 8

    def __init__(self, txt_path, negative_ratio):
        # Close the file as soon as the lines have been read.
        with open(txt_path, encoding="utf-8") as corpus_file:
            stripped = (line.strip() for line in corpus_file)
            self.sentences = [s for s in stripped if s]
        if not self.sentences:
            raise ValueError(f"No non-empty lines found in {txt_path!r}")
        self.neg_ratio = negative_ratio

    def _draw_negative(self, anchor_text):
        """Pick a random sentence, re-drawing while it equals ``anchor_text``.

        The number of draws is bounded so that a corpus consisting of one
        repeated sentence cannot hang the stream; in that case the last
        draw is returned as is.
        """
        candidate = random.choice(self.sentences)
        for _ in range(self._MAX_NEGATIVE_DRAWS - 1):
            if candidate != anchor_text:
                break
            candidate = random.choice(self.sentences)
        return candidate

    def __iter__(self):
        while True:
            for s1 in self.sentences:
                x1 = pad_sentence(encode_sentence(s1))
                # Positive pair: the anchor paired with itself.
                yield (torch.tensor(x1), torch.tensor(x1), torch.tensor(1.0))
                for _ in range(self.neg_ratio):
                    # A negative identical to the anchor would carry a
                    # label that contradicts the positive pair above.
                    s2 = self._draw_negative(s1)
                    x2 = pad_sentence(encode_sentence(s2))
                    yield (torch.tensor(x1), torch.tensor(x2), torch.tensor(0.0))

# Wrap the corpus in the endless pair stream and let the DataLoader group the
# pairs into fixed-size batches.
stream_ds = PairStream(txt_path=DATA_PATH, negative_ratio=NEGATIVE_RATIO)
loader = DataLoader(dataset=stream_ds, batch_size=BATCH_SIZE)

# ===============================
# 4️⃣ Sentence encoder definition
# ===============================
class EncoderBlock(nn.Module):
    """Self-attention followed by a SiLU-gated feed-forward layer.

    Both sub-layers are residual. The attention input is layer-normalized
    before the attention call; the feed-forward output is layer-normalized
    before it is added back onto the residual stream.

    Args:
        embed_dim: Feature width of the input and output.
        latent_dim: Accepted for signature compatibility; not used here.
    """

    def __init__(self, embed_dim, latent_dim):
        super().__init__()
        gate_width = embed_dim * 3
        self.mha = nn.MultiheadAttention(embed_dim, num_heads=8, batch_first=True)
        # WB produces 3 * embed_dim features that forward() splits in two
        # halves; W maps one half-sized tensor back to embed_dim.
        self.WB = nn.Linear(embed_dim, gate_width)
        self.W = nn.Linear(gate_width // 2, embed_dim)
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ln2 = nn.LayerNorm(embed_dim)
        self.ln3 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        """Apply the block to ``x`` and return a tensor of the same shape."""
        # Attention sub-layer: query, key and value are all the normed input.
        normed = self.ln1(x)
        attended = self.mha(normed, normed, normed)[0]
        stream = attended + x

        # Gated feed-forward: one half of the projection is passed through
        # SiLU and scales the other half.
        gate, value = self.WB(self.ln2(stream)).chunk(2, dim=-1)
        projected = self.W(F.silu(gate) * value)
        return self.ln3(projected) + stream

class SentenceEncoder(nn.Module):
    """Map a batch of padded token-id sequences to fixed-size sentence vectors.

    Token and learned position embeddings are summed, passed through two
    ``EncoderBlock`` layers and a final LayerNorm, averaged over the
    non-padding positions, projected to ``latent_dim`` and squashed with
    ``tanh``.

    Padding positions are excluded from the average so that the sentence
    vector does not depend on how much padding follows the real tokens.
    The encoder blocks themselves are still called without a padding mask.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Width of the token/position embeddings.
        latent_dim: Width of the returned sentence vector.
        max_len: Longest sequence the position table can index.
    """

    def __init__(self, vocab_size, embed_dim, latent_dim, max_len):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_id)
        self.pos = nn.Embedding(max_len, embed_dim)
        self.blocks = nn.ModuleList([EncoderBlock(embed_dim, latent_dim) for _ in range(2)])
        self.ln_f = nn.LayerNorm(embed_dim)
        self.latent = nn.Linear(embed_dim, latent_dim)

    def forward(self, x):
        """Encode ``x`` of shape ``(batch, length)`` into ``(batch, latent_dim)``.

        Raises:
            ValueError: If ``length`` exceeds the position table size.
        """
        b, l = x.shape
        if l > self.pos.num_embeddings:
            # Fail with a readable message instead of an out-of-range
            # embedding lookup.
            raise ValueError(
                f"sequence length {l} exceeds max_len {self.pos.num_embeddings}"
            )
        pos_ids = torch.arange(l, device=x.device).unsqueeze(0).expand(b, l)
        h = self.embed(x) + self.pos(pos_ids)
        for block in self.blocks:
            h = block(h)
        h = self.ln_f(h)

        # Mean over real tokens only. The padding id is read back from the
        # embedding layer, so it always matches the one used at construction.
        keep = (x != self.embed.padding_idx).unsqueeze(-1).to(h.dtype)
        # clamp keeps an all-padding row (e.g. an empty query) finite.
        counts = keep.sum(dim=1).clamp(min=1.0)
        pooled = (h * keep).sum(dim=1) / counts
        return torch.tanh(self.latent(pooled))

# Build the sentence encoder from the configured sizes and place it on the
# selected device.
encoder = SentenceEncoder(
    vocab_size=vocab_size,
    embed_dim=EMBED_DIM,
    latent_dim=LATENT_DIM,
    max_len=MAX_LEN,
).to(device)

# ===============================
# 5️⃣ Cosine + Contrastive Loss
# ===============================
def cosine_sim(v1, v2, eps=1e-8):
    """Cosine similarity of ``v1`` and ``v2`` along the last dimension.

    Args:
        v1: First tensor.
        v2: Second tensor, multiplied elementwise with ``v1``.
        eps: Added to the product of the two norms to avoid dividing by zero.

    Returns:
        A tensor with the last dimension reduced away.
    """
    numerator = torch.sum(v1 * v2, dim=-1)
    denominator = v1.norm(dim=-1) * v2.norm(dim=-1) + eps
    return numerator / denominator

def contrastive_loss(pred, label, margin=0.7):
    """Contrastive loss on cosine similarities.

    The similarity is turned into a distance ``1 - pred``. Pairs with
    ``label == 1`` are penalized by the squared distance; pairs with
    ``label == 0`` are penalized by the squared amount by which their
    distance falls short of ``margin``.

    Args:
        pred: Similarity scores.
        label: 1 for pairs that should be similar, 0 otherwise.
        margin: Distance beyond which a dissimilar pair costs nothing.

    Returns:
        The mean loss over all elements.
    """
    distance = 1 - pred
    pull_term = label * distance.pow(2)
    shortfall = torch.clamp(margin - distance, min=0)
    push_term = (1 - label) * shortfall.pow(2)
    return torch.mean(pull_term + push_term)

# Adam over every encoder parameter, learning rate 1e-5. The optimizer is
# created before the model is wrapped by torch.compile below.
optimizer = torch.optim.Adam(encoder.parameters(), lr=1e-5)


# Wrap the model and both loss helpers with torch.compile. The names are
# rebound, so all later code calls the compiled versions.
encoder = torch.compile(encoder)
cosine_sim = torch.compile(cosine_sim)
contrastive_loss = torch.compile(contrastive_loss)
# ===============================
# 6️⃣ Training loop
# ===============================
# The pair stream never ends, so training is capped at a fixed number of
# batches.
# NOTE(review): 23119910 is presumably the number of lines/pairs in the
# corpus — confirm before changing the dataset.
steps_per_epoch = 23119910 // BATCH_SIZE

from tqdm import tqdm

encoder.train()

progress = tqdm(range(steps_per_epoch), desc="Training", ncols=120)

# zip() stops as soon as the progress range is exhausted.
for step, batch in zip(progress, loader):
    # Move both id tensors and the labels onto the training device.
    x1, x2, y = (item.to(device) for item in batch)

    # Forward: encode both sides, score the pair, compute the loss.
    v1, v2 = encoder(x1), encoder(x2)
    loss = contrastive_loss(cosine_sim(v1, v2), y)

    # Backward: clear old gradients, backpropagate, update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Show the current loss on the tqdm bar.
    progress.set_postfix(loss=f"{loss.item():.4f}")

# ===============================
# 7️⃣ Build vectors for retrieval
# ===============================
# Collect the non-empty lines among the first LIMIT lines of the corpus as
# the retrieval candidates. LIMIT counts lines read, blank ones included, so
# fewer than LIMIT prompts may result.
LIMIT = 4000
prompts = []
# The context manager closes the corpus file even when the loop exits early.
with open(DATA_PATH, "r", encoding="utf-8") as corpus_file:
    for i, line in enumerate(corpus_file):
        if i >= LIMIT:
            break
        line = line.strip()
        if line:
            prompts.append(line)

@torch.no_grad()
def get_sentence_vector(sentence):
    """Encode one sentence and return its vector as a 1-D NumPy array.

    The sentence is tokenized, padded, run through ``encoder`` as a batch
    of one with gradient tracking disabled, and copied back to the CPU.
    """
    padded_ids = pad_sentence(encode_sentence(sentence))
    # Wrap in an outer list to add the batch dimension.
    batch = torch.tensor([padded_ids]).to(device)
    vectors = encoder(batch).cpu().numpy()
    return vectors[0]

# Always rebuild the matrix instead of loading "corpus_vectors.npy" when it
# exists: the encoder is constructed and trained from scratch earlier in this
# script and its weights are never saved, so vectors written by a previous
# run belong to a different model and would not match query vectors produced
# now. The file is still written so the vectors can be inspected afterwards.
# Switch to inference mode first; the layers visible in this file behave the
# same in both modes, but this keeps encoding safe if dropout is added later.
encoder.eval()
corpus_vectors = np.stack([get_sentence_vector(p) for p in prompts]).astype(np.float16)
np.save("corpus_vectors.npy", corpus_vectors)

# Per-row L2 norms, kept for cosine scoring.
corpus_norms = np.linalg.norm(corpus_vectors, axis=1)

# ===============================
# 8️⃣ Search function
# ===============================
def search(query, top_k=3):
    """Return the ``top_k`` stored prompts most similar to ``query``.

    Similarity is the cosine between the query vector and each row of
    ``corpus_vectors``. The stored float16 matrix is widened to float32 for
    the arithmetic: in float16 the ``1e-8`` stabilizer rounds to zero (so a
    zero-norm row would divide by zero) and close scores become
    indistinguishable. Row norms are recomputed here in float32 for the
    same reason.

    Args:
        query: Text to look up.
        top_k: Maximum number of results.

    Returns:
        A list of ``(prompt, similarity)`` tuples, best match first.
    """
    q_vec = get_sentence_vector(query).astype(np.float32)
    matrix = corpus_vectors.astype(np.float32)
    sims = matrix @ q_vec
    sims /= np.linalg.norm(matrix, axis=1) * np.linalg.norm(q_vec) + 1e-8
    # argsort is ascending, so reverse before taking the first top_k.
    top_idx = np.argsort(sims)[::-1][:top_k]
    return [(prompts[i], float(sims[i])) for i in top_idx]


# ===============================
# 9️⃣ Test
# ===============================
# Smoke test: run one sample query and print each hit with its score.
query = "점심이나 저녁을 우리와 함께 먹을 건가요?"
results = search(query)
for prompt_text, score in results:
    print(f"Prompt: {prompt_text}\n유사도: {score:.3f}\n---")