kost / preprocess.py
noranisa's picture
Create preprocess.py
05fbe77 verified
# preprocess.py
import json
import sqlite3
import numpy as np
from sentence_transformers import SentenceTransformer
# Load the model from Hugging Face. It is downloaded the first time this is run.
# NOTE: these are module-level statements, so they execute (and print) whenever
# this module is imported, not only when it is run as a script.
print("Loading sentence-transformer model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded.")
# Create the database and its table
def setup_database(db_path='kost.db'):
    """Create a fresh, empty ``kost`` table in the SQLite database at *db_path*.

    Any existing ``kost`` table is dropped first, so every call starts from
    an empty table.

    Args:
        db_path: Path of the SQLite database file. Defaults to ``'kost.db'``.

    Raises:
        sqlite3.Error: If the database cannot be opened or the DDL fails.
    """
    conn = sqlite3.connect(db_path)
    try:
        # ``with conn`` commits on success and rolls back on error; it does
        # NOT close the connection, hence the surrounding try/finally.
        with conn:
            # Drop the table if it already exists so we start from scratch.
            conn.execute('DROP TABLE IF EXISTS kost')
            conn.execute('''
                CREATE TABLE kost (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    title TEXT,
                    price TEXT,
                    location TEXT,
                    url TEXT,
                    image_url TEXT,
                    embedding BLOB
                )
            ''')
    finally:
        conn.close()
# Insert the data into the database
def insert_data(data, db_path='kost.db', encoder=None):
    """Embed each listing and insert it, with its embedding, into ``kost``.

    The text that gets embedded is ``"<title>. Lokasi: <location>"``; a
    missing or null title/location contributes an empty string rather than
    the literal text ``"None"``.

    Args:
        data: Iterable of dicts. The keys read are ``title``, ``price``,
            ``location``, ``url`` and ``imageUrl`` (stored in the
            ``image_url`` column); absent keys are stored as NULL.
        db_path: Path of the SQLite database file. Defaults to ``'kost.db'``.
        encoder: Object with an ``encode(texts, show_progress_bar=...)``
            method returning one vector per text. Defaults to the
            module-level ``model``.

    Raises:
        ValueError: If the encoder returns a different number of vectors
            than there are items.
        sqlite3.Error: If the insert fails; nothing is committed then.
    """
    items = list(data)
    if not items:
        # Nothing to embed or insert; skip the model call and the DB entirely.
        return

    if encoder is None:
        encoder = model

    # Combine title and location so the embedding carries more context.
    # ``or ''`` (not a .get default) also covers keys present with a null value.
    texts_to_embed = [
        f"{item.get('title') or ''}. Lokasi: {item.get('location') or ''}"
        for item in items
    ]

    print(f"Generating embeddings for {len(texts_to_embed)} items...")
    embeddings = encoder.encode(texts_to_embed, show_progress_bar=True)
    print("Embeddings generated.")

    if len(embeddings) != len(items):
        # zip() would silently drop rows on a mismatch; fail loudly instead.
        raise ValueError(
            f"Expected {len(items)} embeddings, got {len(embeddings)}"
        )

    rows = [
        (
            item.get('title'),
            item.get('price'),
            item.get('location'),
            item.get('url'),
            item.get('imageUrl'),
            # Store the vector as raw float32 bytes so readers can decode the
            # BLOB with a known dtype.
            np.asarray(embedding, dtype=np.float32).tobytes(),
        )
        for item, embedding in zip(items, embeddings)
    ]

    # Open the connection only after the (slow) embedding step has finished.
    conn = sqlite3.connect(db_path)
    try:
        # ``with conn`` commits on success and rolls back on error.
        with conn:
            conn.executemany(
                'INSERT INTO kost (title, price, location, url, image_url, embedding) VALUES (?, ?, ?, ?, ?, ?)',
                rows,
            )
    finally:
        conn.close()
if __name__ == '__main__':
    # Script entry point: rebuild the database from scratch, then load the
    # scraped listings from JSON and insert them.
    setup_database()

    with open('data/fb_marketplace_data.json', 'r', encoding='utf-8') as f:
        kost_data = json.load(f)

    # Filter out items that have no title (missing, null or empty).
    kost_data_filtered = [item for item in kost_data if item.get('title')]

    insert_data(kost_data_filtered)
    print("Database 'kost.db' has been created and populated successfully.")