Spaces:

gracephamit
/

g8-cs106

Sleeping

File size: 5,345 Bytes

0dd9600

# step3_encode_dataset_hybrid.py
import json
import os
import pickle
import re
import unicodedata

import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from underthesea import word_tokenize

# Model name handed to SentenceTransformer() in main().
MODEL_NAME = "keepitreal/vietnamese-sbert"

# Knowledge-base JSON file that main() loads.
INPUT_JSON = "dataset/knowledge_base.json"
# Output directory for all artifacts; main() creates it if it is missing.
OUT_DIR = "artifacts"
# Encoded vectors, written with np.save.
VECTORS_NPY = os.path.join(OUT_DIR, "kb_vectors.npy")
# Per-item metadata list, written as JSON.
META_JSON = os.path.join(OUT_DIR, "kb_meta.json")
# Pickled BM25Okapi index.
BM25_PKL = os.path.join(OUT_DIR, "bm25_index.pkl")
# Pickled token lists that the BM25 index was built from.
TOKENIZED_PKL = os.path.join(OUT_DIR, "tokenized_corpus.pkl")


def preprocess_vietnamese_text(text: str) -> str:
    """Normalize Vietnamese text while keeping its diacritics.

    The text is composed to Unicode NFC, lower-cased, stripped of punctuation
    and symbols, and has every whitespace run collapsed to a single space.

    Args:
        text: Raw input text. ``None`` and the empty string are accepted.

    Returns:
        The cleaned text, or ``""`` when ``text`` is empty or ``None``.
    """
    if not text:
        return ""
    # Compose decomposed sequences (base letter + combining tone marks) first.
    # Combining marks are not word characters, so without this step the
    # substitution below would strip the diacritics from NFD-encoded input.
    text = unicodedata.normalize("NFC", text).lower()
    # In a str pattern \w is Unicode-aware and already matches every
    # precomposed Vietnamese letter, so no explicit letter list is needed.
    text = re.sub(r"[^\w\s]", " ", text)
    return " ".join(text.split())


def extract_keywords(item: dict, max_keywords: int = 30) -> list:
    """Collect lower-cased keyword phrases for one knowledge-base item.

    The topic and chapter come first and are each kept as one whole phrase.
    They are followed by consecutive two-word phrases (bigrams) taken from
    ``content_for_embedding`` in reading order.

    Args:
        item: Knowledge-base record. Missing fields and fields set to
            ``None`` (JSON ``null``) are treated as empty.
        max_keywords: Upper bound on the number of keywords returned.
            Negative values are treated as ``0``.

    Returns:
        A list of at most ``max_keywords`` keyword strings.
    """
    max_keywords = max(max_keywords, 0)
    keywords = []

    # Prioritize topic/chapter as exact phrases.
    for field in ("topic", "chapter"):
        phrase = (item.get(field) or "").lower()
        if phrase:
            keywords.append(phrase)

    content = (item.get("content_for_embedding") or "").lower()
    room = max_keywords - len(keywords)
    if content and room > 0:
        # Only the first `room` bigrams can survive the cap, and those need
        # just `room + 1` words, so do not build bigrams for the whole text.
        words = content.split()[: room + 1]
        keywords.extend(
            f"{first} {second}" for first, second in zip(words, words[1:])
        )

    return keywords[:max_keywords]


def extract_text_for_embedding(item: dict) -> str:
    """Build the text that represents one item for embedding.

    The parts are joined with ``". "`` in this order: the labelled topic, the
    ``content_for_embedding`` field, ``metadata["raw_text"]`` (only when it
    differs from the content), and the labelled chapter. The joined string is
    then passed through ``preprocess_vietnamese_text``.

    Args:
        item: Knowledge-base record. Missing fields and fields set to
            ``None`` (JSON ``null``) are treated as empty; ``metadata`` is
            ignored unless it is a dict.

    Returns:
        The preprocessed text; empty when the item has no usable field.
    """
    # `or ""` also covers keys that are present but null in the JSON file,
    # which `item.get(key, "")` would return as None.
    topic = (item.get("topic") or "").strip()
    content = (item.get("content_for_embedding") or "").strip()
    chapter = (item.get("chapter") or "").strip()

    raw_text = ""
    metadata = item.get("metadata")
    if isinstance(metadata, dict):
        raw_text = (metadata.get("raw_text") or "").strip()

    parts = []
    if topic:
        parts.append(f"Chủ đề: {topic}")
    if content:
        parts.append(content)
    # Skip raw_text when it merely repeats the content.
    if raw_text and raw_text != content:
        parts.append(raw_text)
    if chapter:
        parts.append(f"Thuộc: {chapter}")

    return preprocess_vietnamese_text(". ".join(parts))


def main():
    """Encode the knowledge base into semantic vectors and a BM25 index.

    Loads ``INPUT_JSON``, builds one preprocessed text per item (items with
    fewer than 10 characters of text are skipped with a warning), encodes the
    texts with the ``MODEL_NAME`` sentence-transformer, builds a ``BM25Okapi``
    index over the tokenized texts, and writes the vectors, metadata, index
    and tokenized corpus under ``OUT_DIR``.

    Raises:
        ValueError: If no item has enough text to be indexed.
    """
    os.makedirs(OUT_DIR, exist_ok=True)

    # Load dataset
    with open(INPUT_JSON, "r", encoding="utf-8") as f:
        data = json.load(f)

    print(f"📊 Processing {len(data)} items...")

    # Extract texts and metadata
    texts = []
    meta = []
    all_keywords = []

    for idx, item in enumerate(data):
        item_id = item.get("id", f"idx_{idx}")
        text = extract_text_for_embedding(item)

        if not text or len(text) < 10:
            print(f"⚠️ Warning: Item {item_id} has insufficient text")
            continue

        # Keywords are only needed for items that are kept.
        keywords = extract_keywords(item)

        # Same guard as extract_text_for_embedding: metadata may be missing,
        # null, or not a dict at all.
        metadata = item.get("metadata")
        knowledge_type = (
            metadata.get("knowledge_type", "") if isinstance(metadata, dict) else ""
        )

        texts.append(text)
        all_keywords.append(keywords)
        meta.append({
            # Position of this item's row in the saved vectors / BM25 corpus.
            "index": len(texts) - 1,
            "id": item_id,
            "topic": item.get("topic", ""),
            "chapter": item.get("chapter", ""),
            "knowledge_type": knowledge_type,
            "keywords": keywords,
            "text_length": len(text)
        })

    # Fail early with a clear message: an empty corpus would otherwise only
    # surface later as a division by zero inside BM25Okapi, after the model
    # has already been loaded.
    if not texts:
        raise ValueError(f"No usable items found in {INPUT_JSON}; nothing to encode")

    print(f"📏 Avg text length: {np.mean([m['text_length'] for m in meta]):.0f} chars")
    print(f"🔑 Avg keywords: {np.mean([len(k) for k in all_keywords]):.1f} per item")

    # ===== 1. Semantic Embeddings =====
    print(f"\n🤖 Loading model: {MODEL_NAME}")
    model = SentenceTransformer(MODEL_NAME)

    print("🔄 Encoding semantic vectors...")
    vectors = model.encode(
        texts,
        batch_size=32,
        show_progress_bar=True,
        normalize_embeddings=True,
        convert_to_numpy=True
    )
    vectors = np.asarray(vectors, dtype=np.float32)

    # ===== 2. BM25 Index =====
    print("\n📝 Building BM25 index...")
    tokenized_corpus = []

    for text in texts:
        try:
            # Vietnamese word segmentation
            tokens = word_tokenize(text, format="text").split()
        except Exception as exc:
            # Best-effort fallback to a plain whitespace split. Only
            # Exception is caught so Ctrl-C / SystemExit still stop the run.
            print(f"⚠️ Warning: word_tokenize failed ({exc!r}); using whitespace split")
            tokens = text.split()
        tokenized_corpus.append(tokens)

    bm25 = BM25Okapi(tokenized_corpus)

    # ===== 3. Save Everything =====
    print("\n💾 Saving artifacts...")

    np.save(VECTORS_NPY, vectors)

    with open(META_JSON, "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)

    with open(BM25_PKL, "wb") as f:
        pickle.dump(bm25, f)

    with open(TOKENIZED_PKL, "wb") as f:
        pickle.dump(tokenized_corpus, f)

    print("\n✅ Step 3 DONE (Hybrid)")
    print(f"📦 Items: {len(texts)}")
    print(f"📐 Vector shape: {vectors.shape}")
    print("💾 Saved:")
    print(f"   - {VECTORS_NPY}")
    print(f"   - {META_JSON}")
    print(f"   - {BM25_PKL}")
    print(f"   - {TOKENIZED_PKL}")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()