# preprocess.py
"""Build the ``kost.db`` SQLite database from scraped marketplace listings.

Every listing is stored in the ``kost`` table together with a sentence
embedding computed from its title and location.
"""
import json
import sqlite3
from contextlib import closing

import numpy as np
from sentence_transformers import SentenceTransformer

# Default SQLite file used by the helpers below.
DB_PATH = 'kost.db'

# Load the model from Hugging Face. It is downloaded the first time this runs.
print("Loading sentence-transformer model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded.")


def setup_database(db_path: str = DB_PATH) -> None:
    """Create an empty ``kost`` table, discarding any existing one.

    Args:
        db_path: Path of the SQLite database file to (re)initialise.
    """
    # closing() guarantees the connection is released even when a statement
    # raises; the inner ``with conn`` commits on success and rolls back on
    # error.
    with closing(sqlite3.connect(db_path)) as conn:
        with conn:
            # Drop the table if it already exists so we start from scratch.
            conn.execute('DROP TABLE IF EXISTS kost')
            conn.execute('''
                CREATE TABLE kost (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    title TEXT,
                    price TEXT,
                    location TEXT,
                    url TEXT,
                    image_url TEXT,
                    embedding BLOB
                )
            ''')


def insert_data(data, db_path: str = DB_PATH) -> None:
    """Embed each listing and insert it into the ``kost`` table.

    Args:
        data: Iterable of dicts. The keys ``title``, ``price``, ``location``,
            ``url`` and ``imageUrl`` are read with ``dict.get``, so a missing
            key is stored as NULL.
        db_path: Path of the SQLite database file to write to.

    The ``embedding`` column holds the raw bytes of a float32 vector; decode
    it with ``np.frombuffer(blob, dtype=np.float32)``.
    """
    # Materialise once: the items are walked twice (to build the texts and to
    # build the rows), which would silently drop every row for a generator.
    items = list(data)
    if not items:
        # Nothing to embed or insert; skip the model call entirely.
        return

    # Combine title and location for a richer embedding. ``or ''`` keeps an
    # explicit null from being embedded as the literal text "None".
    texts_to_embed = [
        f"{item.get('title') or ''}. Lokasi: {item.get('location') or ''}"
        for item in items
    ]

    print(f"Generating embeddings for {len(texts_to_embed)} items...")
    # Pin the dtype so the stored BLOB layout does not depend on the model's
    # output precision.
    embeddings = np.asarray(
        model.encode(texts_to_embed, show_progress_bar=True),
        dtype=np.float32,
    )
    print("Embeddings generated.")

    rows = [
        (
            item.get('title'),
            item.get('price'),
            item.get('location'),
            item.get('url'),
            item.get('imageUrl'),
            embedding.tobytes(),  # store the numpy array as a BLOB
        )
        for item, embedding in zip(items, embeddings)
    ]

    # Open the connection only after the (slow) embedding step, and make sure
    # it is closed and the transaction committed/rolled back on every path.
    with closing(sqlite3.connect(db_path)) as conn:
        with conn:
            conn.executemany(
                'INSERT INTO kost (title, price, location, url, image_url, embedding) VALUES (?, ?, ?, ?, ?, ?)',
                rows,
            )


if __name__ == '__main__':
    setup_database()

    with open('data/fb_marketplace_data.json', 'r', encoding='utf-8') as f:
        kost_data = json.load(f)

    # Drop items that have no title.
    kost_data_filtered = [item for item in kost_data if item.get('title')]

    insert_data(kost_data_filtered)
    print("Database 'kost.db' has been created and populated successfully.")