import pandas as pd from transformers import DistilBertTokenizer, DistilBertModel import torch import os # Define dtype for loading books.csv dtype_spec = { 'ISBN': str, 'Book-Title': str, 'Book-Author': str, 'Year-Of-Publication': str, 'Publisher': str, 'Image-URL-S': str, 'Image-URL-M': str, 'Image-URL-L': str } # Load the CSV file books_df = pd.read_csv("books.csv", encoding='latin1', delimiter=';', on_bad_lines='skip', dtype=dtype_spec) # Load pre-trained DistilBERT tokenizer and model tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') model = DistilBertModel.from_pretrained('distilbert-base-uncased') # Function to get BERT embedding for a single text def get_bert_embedding(text): inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512) with torch.no_grad(): # Disable gradients for inference outputs = model(**inputs) # Use the [CLS] token's embedding (first token) cls_embedding = outputs.last_hidden_state[0][0].numpy() return cls_embedding # File names for output and progress output_file = "books_with_embeddings.csv" progress_file = "progress.txt" # Determine starting row index if os.path.exists(progress_file): with open(progress_file, "r") as pf: start_idx = int(pf.read().strip()) print(f"Resuming from row {start_idx}.") else: start_idx = 0 print("Starting from the beginning.") batch_size = 10 total_rows = len(books_df) # If starting from 0, we need to write the header; otherwise, we append. write_header = start_idx == 0 try: # Process the data in batches for idx in range(start_idx, total_rows, batch_size): # Select the batch batch_df = books_df.iloc[idx: idx + batch_size].copy() # Compute the embeddings for the "Book-Title" column batch_df['embedding'] = batch_df['Book-Title'].apply(lambda title: get_bert_embedding(title)) # Convert numpy arrays to lists for CSV storage batch_df['embedding'] = batch_df['embedding'].apply(lambda x: x.tolist()) # Write the current batch to the CSV file batch_df.to_csv(output_file, mode='a', header=write_header, index=False) # After the first batch, do not write header again write_header = False # Update progress (save the next starting row) next_idx = idx + batch_size with open(progress_file, "w") as pf: pf.write(str(next_idx)) print(f"Processed rows {idx} to {min(next_idx, total_rows)} out of {total_rows}.") except KeyboardInterrupt: print(f"Process interrupted at row {idx}. Progress saved in {progress_file}.")