Spaces:
Sleeping
Sleeping
File size: 2,551 Bytes
653865f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 | import pickle
import sqlite3
import logging
from pathlib import Path
from tqdm import tqdm
# Configure root logging at INFO so the progress messages emitted during the
# migration are visible. basicConfig does nothing if the root logger already
# has handlers, so an embedding application's own configuration is kept.
# NOTE(review): this runs at import time, not only when executed as a script.
logging.basicConfig(level=logging.INFO)
# Module-scoped logger used by everything below.
logger = logging.getLogger(__name__)
def _insert_in_batches(cursor, sql, rows, batch_size):
    """Run ``cursor.executemany(sql, ...)`` over *rows* in bounded chunks.

    Rows are buffered until *batch_size* of them are pending and then written,
    so the full row stream never has to be materialized at once.
    """
    pending = []
    for row in rows:
        pending.append(row)
        if len(pending) >= batch_size:
            cursor.executemany(sql, pending)
            pending = []
    # Flush the final, partially filled chunk.
    if pending:
        cursor.executemany(sql, pending)


def migrate_itemcf(pkl_path, db_path, batch_size=100000):
    """Copy a pickled ItemCF model into two SQLite tables.

    The pickle is read with ``.get`` for the keys ``'sim_matrix'`` and
    ``'user_hist'``; a missing key is treated as empty. ``sim_matrix`` is
    iterated as ``{item1: {item2: score}}`` and ``user_hist`` as
    ``{user_id: iterable of isbn}``.

    The tables ``item_similarity(item1, item2, score)`` and
    ``user_history(user_id, isbn)`` are dropped and recreated, filled, and
    then indexed on ``item1`` and ``user_id`` respectively.

    Args:
        pkl_path: Path to the pickle file (str or path-like).
        db_path: Path of the SQLite database to write (str or path-like).
        batch_size: Maximum number of rows handed to one ``executemany``
            call. Defaults to 100000.

    Returns:
        None. If the pickle file does not exist an error is logged and the
        function returns without touching the database.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    pkl_path = Path(pkl_path)
    db_path = Path(db_path)
    if not pkl_path.exists():
        logger.error("Pickle not found at %s", pkl_path)
        return

    # Report the real on-disk size instead of a hard-coded figure.
    size_mb = pkl_path.stat().st_size / (1024 * 1024)
    logger.info(
        "Loading pickle %s (%.1f MB)... This may take a minute.",
        pkl_path,
        size_mb,
    )
    # NOTE: unpickling can execute arbitrary code; only point this at model
    # files produced by a trusted pipeline.
    with open(pkl_path, 'rb') as f:
        data = pickle.load(f)
    # Assumes the pickle payload is a dict-like object exposing .get().
    sim_matrix = data.get('sim_matrix', {})
    user_hist = data.get('user_hist', {})

    logger.info("Connecting to SQLite %s...", db_path)
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Recreate both tables from scratch so reruns never duplicate rows.
        cursor.execute("DROP TABLE IF EXISTS item_similarity")
        cursor.execute("""
            CREATE TABLE item_similarity (
                item1 TEXT,
                item2 TEXT,
                score REAL
            )
        """)
        cursor.execute("DROP TABLE IF EXISTS user_history")
        cursor.execute("""
            CREATE TABLE user_history (
                user_id TEXT,
                isbn TEXT
            )
        """)

        logger.info("Inserting item similarity data...")
        # The progress bar advances once per outer key (item1).
        similarity_rows = (
            (item1, item2, score)
            for item1, related in tqdm(sim_matrix.items(), desc="ItemCF Similarity")
            for item2, score in related.items()
        )
        _insert_in_batches(
            cursor,
            "INSERT INTO item_similarity VALUES (?, ?, ?)",
            similarity_rows,
            batch_size,
        )

        logger.info("Inserting user history data...")
        history_rows = (
            (user_id, isbn)
            for user_id, isbns in tqdm(user_hist.items(), desc="User History")
            for isbn in isbns
        )
        _insert_in_batches(
            cursor,
            "INSERT INTO user_history VALUES (?, ?)",
            history_rows,
            batch_size,
        )

        # Indices are built after the bulk load rather than maintained
        # row by row during the inserts.
        logger.info("Creating indices...")
        cursor.execute("CREATE INDEX idx_item1 ON item_similarity(item1)")
        cursor.execute("CREATE INDEX idx_user ON user_history(user_id)")
        conn.commit()
    finally:
        # Always release the database handle, even when the load fails.
        conn.close()
    logger.info("Migration complete.")
# Script entry point: migrate the default ItemCF pickle into the recall database.
if __name__ == "__main__":
    migrate_itemcf(
        pkl_path="data/model/recall/itemcf.pkl",
        db_path="data/recall_models.db",
    )
|