book-rec-with-LLMs / scripts /init_sqlite_db.py
ymlin105's picture
feat: implement zero-RAM SQLite architecture for Hugging Face deployment
653865f
import sqlite3
import pandas as pd
import logging
from pathlib import Path
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def init_sqlite_db(data_dir: str = "data"):
data_path = Path(data_dir)
db_path = data_path / "books.db"
processed_csv = data_path / "books_processed.csv"
basic_info_csv = data_path / "books_basic_info.csv"
if not processed_csv.exists():
logger.error(f"Source file {processed_csv} not found.")
return
logger.info("Loading CSV data into memory for migration...")
df_p = pd.read_csv(processed_csv)
# Normalize ISBN13 in processed df
df_p['isbn13'] = df_p['isbn13'].astype(str).str.strip().str.replace(".0", "", regex=False)
if basic_info_csv.exists():
logger.info(f"Merging with {basic_info_csv}...")
df_b = pd.read_csv(basic_info_csv)
df_b['isbn13'] = df_b['isbn13'].astype(str).str.strip().str.replace(".0", "", regex=False)
# Keep only unique info from basic_info
df_b = df_b[['isbn13', 'isbn10', 'image', 'publisher', 'publishedDate']].drop_duplicates(subset=['isbn13'])
df = pd.merge(df_p, df_b, on='isbn13', how='left')
else:
df = df_p
# Fill NaN
df = df.fillna("")
logger.info(f"Connecting to {db_path}...")
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
# 1. Create main books table
logger.info("Creating 'books' table...")
df.to_sql('books', conn, if_exists='replace', index=False)
cursor.execute("CREATE UNIQUE INDEX IF NOT EXISTS idx_isbn13 ON books (isbn13)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_isbn10 ON books (isbn10)")
# 2. Create FTS5 virtual table for keyword search
logger.info("Creating FTS5 virtual table 'books_fts'...")
cursor.execute("DROP TABLE IF EXISTS books_fts")
cursor.execute("""
CREATE VIRTUAL TABLE books_fts USING fts5(
isbn13 UNINDEXED,
title,
description,
authors,
simple_categories,
content='books',
tokenize='porter unicode61'
)
""")
# Populate FTS index
cursor.execute("""
INSERT INTO books_fts(isbn13, title, description, authors, simple_categories)
SELECT isbn13, title, description, authors, simple_categories FROM books
""")
conn.commit()
# Verify
cursor.execute("SELECT COUNT(*) FROM books")
count = cursor.fetchone()[0]
logger.info(f"Migration complete. {count} books imported into SQLite.")
conn.close()
if __name__ == "__main__":
init_sqlite_db()