# [page-scrape residue, not part of the script] Spaces: Sleeping Sleeping
| # step3_encode_dataset_hybrid.py | |
import json
import os
import pickle
import re
import unicodedata

import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from underthesea import word_tokenize
# Sentence-transformers model identifier passed to SentenceTransformer() in main().
MODEL_NAME = "keepitreal/vietnamese-sbert"
# Input knowledge base: a JSON file that main() loads and iterates item by item.
INPUT_JSON = "dataset/knowledge_base.json"
# Directory that receives every artifact written by main().
OUT_DIR = "artifacts"
# Embedding matrix written with np.save.
VECTORS_NPY = os.path.join(OUT_DIR, "kb_vectors.npy")
# Per-item metadata (index, id, topic, chapter, keywords, ...) written as JSON.
META_JSON = os.path.join(OUT_DIR, "kb_meta.json")
# Pickled BM25Okapi index.
BM25_PKL = os.path.join(OUT_DIR, "bm25_index.pkl")
# Pickled token lists that the BM25 index was built from.
TOKENIZED_PKL = os.path.join(OUT_DIR, "tokenized_corpus.pkl")
def preprocess_vietnamese_text(text: str) -> str:
    """Normalize Vietnamese text for embedding and BM25 indexing.

    The text is composed to Unicode NFC, lowercased, stripped of every
    character that is neither a word character nor whitespace, and has runs
    of whitespace collapsed to single spaces.

    Args:
        text: Raw input text. ``None`` and ``""`` are accepted.

    Returns:
        The normalized text, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""
    # Compose decomposed input (base letter + combining marks) first.
    # Combining marks are not matched by \w, so without this step the filter
    # below would replace them with spaces and split words apart
    # (decomposed "tiếng" would come out as "tie ng").
    text = unicodedata.normalize("NFC", text.lower())
    # Keep Vietnamese diacritics: anything that is not a word character,
    # whitespace, or one of the listed accented letters becomes a space.
    text = re.sub(r'[^\w\sàáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ]', ' ', text)
    # Collapse the whitespace runs left behind by the substitution.
    return ' '.join(text.split())
def extract_keywords(item: dict, max_keywords: int = 30) -> list:
    """Collect lowercase keyword phrases for one knowledge-base item.

    The result starts with the full ``topic`` and ``chapter`` strings (each
    kept as a single phrase), followed by consecutive two-word phrases taken
    from ``content_for_embedding`` in reading order.

    Args:
        item: Knowledge-base record. Fields that are missing, ``None`` or not
            strings are ignored instead of raising.
        max_keywords: Upper bound on the number of keywords returned.
            Negative values are treated as 0.

    Returns:
        A list of at most ``max_keywords`` lowercase phrases.
    """
    max_keywords = max(max_keywords, 0)
    keywords = []

    # Topic and chapter come first and stay whole, so they survive truncation.
    for field in ("topic", "chapter"):
        value = item.get(field)
        if isinstance(value, str) and value:
            keywords.append(value.lower())

    content = item.get("content_for_embedding")
    if isinstance(content, str):
        words = content.lower().split()
        # Only build as many bigrams as still fit under the limit rather than
        # generating every pair and discarding most of them afterwards.
        remaining = max(max_keywords - len(keywords), 0)
        keywords.extend(
            f"{first} {second}"
            for first, second in zip(words[:remaining], words[1:])
        )

    return keywords[:max_keywords]  # Limit to avoid noise
def _stripped_str(value) -> str:
    """Return ``value`` stripped when it is a string, otherwise ``""``."""
    return value.strip() if isinstance(value, str) else ""


def extract_text_for_embedding(item: dict) -> str:
    """Build the text that represents one knowledge-base item.

    The non-empty parts are joined with ". " in this order:
    "Chủ đề: <topic>", the ``content_for_embedding`` field,
    ``metadata["raw_text"]`` (only when it differs from the content), and
    "Thuộc: <chapter>". The joined string is then passed through
    ``preprocess_vietnamese_text``.

    Args:
        item: Knowledge-base record. Fields that are missing, ``None`` or not
            strings are treated as empty instead of raising.

    Returns:
        The preprocessed text; ``""`` when the item has no usable field.
    """
    parts = []

    topic = _stripped_str(item.get("topic"))
    if topic:
        parts.append(f"Chủ đề: {topic}")

    content = _stripped_str(item.get("content_for_embedding"))
    if content:
        parts.append(content)

    metadata = item.get("metadata", {})
    if isinstance(metadata, dict):
        raw_text = _stripped_str(metadata.get("raw_text"))
        # Skip raw_text when it merely repeats the content field.
        if raw_text and raw_text != content:
            parts.append(raw_text)

    chapter = _stripped_str(item.get("chapter"))
    if chapter:
        parts.append(f"Thuộc: {chapter}")

    return preprocess_vietnamese_text(". ".join(parts))
def main():
    """Encode the knowledge base into semantic vectors and a BM25 index.

    Steps:
      1. Load ``INPUT_JSON`` and build one text plus one metadata record per
         item; items whose text is shorter than 10 characters are skipped
         with a warning.
      2. Encode the texts with the ``MODEL_NAME`` sentence-transformers model
         (normalized embeddings, stored as float32).
      3. Tokenize the texts with ``word_tokenize`` (whitespace split as a
         fallback) and build a ``BM25Okapi`` index.
      4. Write vectors, metadata, the BM25 index and the tokenized corpus
         into ``OUT_DIR``.

    Raises:
        ValueError: If no item yields enough text to index.
    """
    os.makedirs(OUT_DIR, exist_ok=True)

    # Load dataset
    with open(INPUT_JSON, "r", encoding="utf-8") as f:
        data = json.load(f)
    print(f"📊 Processing {len(data)} items...")

    # Extract texts and metadata
    texts = []
    meta = []
    all_keywords = []
    for idx, item in enumerate(data):
        item_id = item.get("id", f"idx_{idx}")
        text = extract_text_for_embedding(item)
        if len(text) < 10:
            print(f"⚠️ Warning: Item {item_id} has insufficient text")
            continue

        # Keywords are only needed for items that are actually kept.
        keywords = extract_keywords(item)

        # "metadata" may be missing, null or not a dict; guard it the same
        # way extract_text_for_embedding does instead of crashing on .get().
        metadata = item.get("metadata")
        if not isinstance(metadata, dict):
            metadata = {}

        texts.append(text)
        all_keywords.append(keywords)
        meta.append({
            # Position of this item in `texts`, and therefore its row in the
            # saved vector matrix (skipped items leave no gap).
            "index": len(texts) - 1,
            "id": item_id,
            "topic": item.get("topic", ""),
            "chapter": item.get("chapter", ""),
            "knowledge_type": metadata.get("knowledge_type", ""),
            "keywords": keywords,
            "text_length": len(text),
        })

    # Fail early with a clear message: with no texts the statistics below
    # would be NaN and there would be nothing to encode or index.
    if not texts:
        raise ValueError(
            f"No items with enough text found in {INPUT_JSON}; nothing to encode."
        )

    print(f"📏 Avg text length: {np.mean([m['text_length'] for m in meta]):.0f} chars")
    print(f"🔑 Avg keywords: {np.mean([len(k) for k in all_keywords]):.1f} per item")

    # ===== 1. Semantic Embeddings =====
    print(f"\n🤖 Loading model: {MODEL_NAME}")
    model = SentenceTransformer(MODEL_NAME)
    print("🔄 Encoding semantic vectors...")
    vectors = model.encode(
        texts,
        batch_size=32,
        show_progress_bar=True,
        normalize_embeddings=True,
        convert_to_numpy=True,
    )
    vectors = np.asarray(vectors, dtype=np.float32)

    # ===== 2. BM25 Index =====
    print("\n📝 Building BM25 index...")
    tokenized_corpus = []
    fallback_count = 0
    for text in texts:
        try:
            # Vietnamese word segmentation
            tokens = word_tokenize(text, format="text").split()
        except Exception:
            # Best-effort fallback: simple whitespace split. Catching
            # Exception (not a bare except) lets Ctrl-C / SystemExit through.
            tokens = text.split()
            fallback_count += 1
        tokenized_corpus.append(tokens)
    if fallback_count:
        print(
            f"⚠️ Warning: word_tokenize failed for {fallback_count} item(s); "
            "used whitespace split instead"
        )
    bm25 = BM25Okapi(tokenized_corpus)

    # ===== 3. Save Everything =====
    print("\n💾 Saving artifacts...")
    np.save(VECTORS_NPY, vectors)
    with open(META_JSON, "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)
    with open(BM25_PKL, "wb") as f:
        pickle.dump(bm25, f)
    with open(TOKENIZED_PKL, "wb") as f:
        pickle.dump(tokenized_corpus, f)

    print("\n✅ Step 3 DONE (Hybrid)")
    print(f"📦 Items: {len(texts)}")
    print(f"📐 Vector shape: {vectors.shape}")
    print("💾 Saved:")
    print(f" - {VECTORS_NPY}")
    print(f" - {META_JSON}")
    print(f" - {BM25_PKL}")
    print(f" - {TOKENIZED_PKL}")
# Run the encoding pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()