# book-rec-with-LLMs/scripts/migrate_recall_to_sqlite.py
# Author: ymlin105
# Commit 653865f: feat: implement zero-RAM SQLite architecture for Hugging Face deployment
import pickle
import sqlite3
import logging
from pathlib import Path
from tqdm import tqdm
# Configure the root logger at INFO level so the progress messages below are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by migrate_itemcf for status and error reporting.
logger = logging.getLogger(__name__)
def migrate_itemcf(pkl_path, db_path):
    """Copy the ItemCF recall data from a pickle file into a SQLite database.

    The pickle is read as a mapping with two optional keys:

    * ``sim_matrix`` -- ``{item1: {item2: score}}``, written to the
      ``item_similarity(item1, item2, score)`` table.
    * ``user_hist`` -- ``{user_id: iterable of isbn}``, written to the
      ``user_history(user_id, isbn)`` table.

    Both tables are dropped and recreated, then indexed on ``item1`` and
    ``user_id`` respectively. Every schema change and insert runs inside a
    single transaction, so a failure part-way through rolls back and leaves
    the previous database contents untouched.

    Args:
        pkl_path: Path (str or ``Path``) of the source pickle file.
        db_path: Path (str or ``Path``) of the SQLite database to write.
            Missing parent directories are created.

    Returns:
        None. If ``pkl_path`` does not exist an error is logged and the
        function returns without touching the database.
    """
    pkl_path = Path(pkl_path)
    db_path = Path(db_path)

    if not pkl_path.exists():
        logger.error(f"Pickle not found at {pkl_path}")
        return

    logger.info(f"Loading massive pickle {pkl_path} (1.4GB)... This may take a minute.")
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only run this migration on a pickle produced by a trusted pipeline.
    with open(pkl_path, 'rb') as f:
        data = pickle.load(f)

    sim_matrix = data.get('sim_matrix', {})
    user_hist = data.get('user_hist', {})

    logger.info(f"Connecting to SQLite {db_path}...")
    # sqlite3.connect does not create missing directories; do it up front so
    # the failure mode is not an opaque "unable to open database file".
    db_path.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(db_path)
    try:
        # `with conn` commits on success and rolls back on any exception.
        with conn:
            cursor = conn.cursor()
            # Open the transaction explicitly: by default sqlite3 only starts
            # one implicitly before INSERT/UPDATE/DELETE, which would leave
            # the DROP/CREATE statements outside the rollback scope.
            cursor.execute("BEGIN")

            # Create tables
            cursor.execute("DROP TABLE IF EXISTS item_similarity")
            cursor.execute("""
                CREATE TABLE item_similarity (
                    item1 TEXT,
                    item2 TEXT,
                    score REAL
                )
            """)
            cursor.execute("DROP TABLE IF EXISTS user_history")
            cursor.execute("""
                CREATE TABLE user_history (
                    user_id TEXT,
                    isbn TEXT
                )
            """)

            # Insert Item Similarity
            logger.info("Inserting item similarity data...")
            # executemany consumes the generator lazily, so rows are streamed
            # to SQLite without building intermediate batch lists.
            similarity_rows = (
                (item1, item2, score)
                for item1, related in tqdm(sim_matrix.items(), desc="ItemCF Similarity")
                for item2, score in related.items()
            )
            cursor.executemany(
                "INSERT INTO item_similarity VALUES (?, ?, ?)", similarity_rows
            )

            # Insert User History
            logger.info("Inserting user history data...")
            history_rows = (
                (user_id, isbn)
                for user_id, isbns in tqdm(user_hist.items(), desc="User History")
                for isbn in isbns
            )
            cursor.executemany(
                "INSERT INTO user_history VALUES (?, ?)", history_rows
            )

            # Create Indices (after the bulk load, so inserts do not pay for
            # index maintenance row by row).
            logger.info("Creating indices...")
            cursor.execute("CREATE INDEX idx_item1 ON item_similarity(item1)")
            cursor.execute("CREATE INDEX idx_user ON user_history(user_id)")
    finally:
        # Always release the database handle, even when the migration fails.
        conn.close()

    logger.info("Migration complete.")
if __name__ == "__main__":
    # Default locations, relative to the directory the script is run from:
    # the pickled ItemCF data and the SQLite file it is migrated into.
    source_pickle = "data/model/recall/itemcf.pkl"
    target_db = "data/recall_models.db"
    migrate_itemcf(source_pickle, target_db)