In [None]:
import numpy as np

# Collapse the fine-grained source categories into four coarse labels:
# Fiction, Nonfiction, Children's Fiction and Children's Nonfiction.
# Any category not listed here is left unmapped.
category_mapping = dict(
    [
        ("Fiction", "Fiction"),
        ("Juvenile Fiction", "Children's Fiction"),
        ("Biography & Autobiography", "Nonfiction"),
        ("History", "Nonfiction"),
        ("Literary Criticism", "Nonfiction"),
        ("Philosophy", "Nonfiction"),
        ("Religion", "Nonfiction"),
        ("Comics & Graphic Novels", "Fiction"),
        ("Juvenile Nonfiction", "Children's Nonfiction"),
        ("Science", "Nonfiction"),
        ("Poetry", "Fiction"),
    ]
)

In [None]:
import pandas as pd
# Load the cleaned book table from the working directory.
books = pd.read_csv(filepath_or_buffer="books_cleaned.csv")

In [None]:
# Translate each raw category through category_mapping; categories that are not
# keys of the mapping come out as NaN.
mapped_categories = books["categories"].map(category_mapping)
books["simple_categories"] = mapped_categories

In [None]:
# Display the frame; it now carries the simple_categories column assigned in the previous cell.
books

In [None]:
# Show only the rows that received a simple category from the mapping.
books[books['simple_categories'].notna()]

In [None]:
!pip install hf_xet
from transformers import pipeline

fiction_categories = ['Fiction', 'Nonfiction']
pipe = pipeline("zero-shot-classification",model="facebook/bart-large-mnli", device="cuda")


In [None]:
!pip install --upgrade huggingface_hub


In [None]:
!pip install "huggingface_hub[hf_xet]"


In [None]:
!pip show huggingface_hub
!pip show hf_xet


In [None]:
import os

# NOTE(review): huggingface_hub appears to read HF_HUB_DOWNLOAD_TIMEOUT once, when it
# is first imported, so the variable is set before the transformers import below.
# If transformers was already imported earlier in this kernel the new value may be
# ignored — confirm against the huggingface_hub documentation.
os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "120"

import torch
from transformers import pipeline


def _build_zero_shot_pipeline(model_name, device, **extra_kwargs):
    """Create a zero-shot-classification pipeline with the shared settings.

    Args:
        model_name: Hub identifier of the model to load.
        device: Device index passed to ``pipeline`` (0 for the first GPU, -1 for CPU).
        **extra_kwargs: Additional keyword arguments forwarded to ``pipeline``.

    Returns:
        The object returned by ``transformers.pipeline``.
    """
    return pipeline(
        "zero-shot-classification",
        model=model_name,
        device=device,
        batch_size=64,    # internal pipeline batch size
        max_length=512,   # requested cap on input length
        truncation=True,
        **extra_kwargs,
    )


print("Loading model... (this may take a few minutes on first run)")
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
if cuda_available:
    print(f"GPU device: {torch.cuda.get_device_name(0)}")

# Use the first GPU if available, otherwise the CPU.
pipeline_device = 0 if cuda_available else -1

try:
    pipe = _build_zero_shot_pipeline(
        "facebook/bart-large-mnli",
        pipeline_device,
        use_auth_token=False,
        revision="main",
    )
    print("✅ Model loaded successfully with GPU acceleration!" if cuda_available else "✅ Model loaded (CPU mode)")

except Exception as e:
    print(f"Error with facebook/bart-large-mnli: {e}")
    print("\n🔄 Trying alternative model...")

    try:
        pipe = _build_zero_shot_pipeline(
            "typeform/distilbert-base-uncased-mnli",
            pipeline_device,
        )
        print("✅ Alternative model loaded successfully!")

    except Exception as e2:
        print(f"❌ Error with alternative model: {e2}")
        print("Please check your internet connection and try again.")
        # Stop here: continuing would leave `pipe` undefined, or silently reuse a
        # pipeline created by an earlier cell, and later cells would misbehave.
        raise


In [None]:
# Take the first description among the books labelled 'Fiction' as a sample input.
fiction_descriptions = books.loc[books["simple_categories"] == 'Fiction', 'description']
sequence = fiction_descriptions.reset_index(drop=True)[0]

In [None]:
# Run the zero-shot pipeline on the single sample description and show its raw output.
pipe(sequence, fiction_categories)

In [None]:
# Run the classifier once and reuse the result; calling pipe() separately for the
# scores and for the labels repeats the same inference.
sample_result = pipe(sequence, fiction_categories)
max_index = np.argmax(sample_result["scores"])
max_label = sample_result["labels"][max_index]
max_label

In [None]:
from tqdm import tqdm
import pandas as pd
import time
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import multiprocessing as mp

# Batched inference helper: one pipeline call per slice of inputs.
def generate_predictions(sequences, categories, batch_size=32):
    """Classify ``sequences`` in slices of ``batch_size`` using the global ``pipe``.

    Args:
        sequences: Sliceable collection of input texts (``len`` and ``[a:b]`` are used).
        categories: Candidate labels passed as the second argument to ``pipe``.
        batch_size: Number of sequences handed to ``pipe`` per call.

    Returns:
        A list with one entry per classified sequence: the first element of that
        result's ``'labels'`` list. (The zero-shot pipeline is expected to order
        labels by descending score — see the Transformers documentation.)
    """
    top_labels = []
    batch_starts = range(0, len(sequences), batch_size)

    for start in tqdm(batch_starts, desc="Processing batches"):
        outputs = pipe(sequences[start:start + batch_size], categories)

        # pipe may hand back a single result instead of a list; treat both alike.
        if not isinstance(outputs, list):
            outputs = [outputs]

        top_labels.extend(output['labels'][0] for output in outputs)

    return top_labels

In [None]:
# Get up to 300 nonfiction descriptions. Rows without a description are dropped
# first, because slicing a NaN below would raise a TypeError.
nonfiction_books = (
    books.loc[books["simple_categories"] == 'Nonfiction', 'description']
    .dropna()
    .reset_index(drop=True)[:300]
)

# Truncate for speed
sequences = [desc[:400] for desc in nonfiction_books]

# Classify in batches of 20 with the shared helper. Both result lists are assigned
# (not appended to), so the cell works on a fresh kernel and can be re-run without
# accumulating duplicates.
batch_size = 20
# (sic) the misspelled name is kept because the following cell reads it.
preddicted_cats = generate_predictions(sequences, fiction_categories, batch_size=batch_size)

# Every description in this sample comes from a book already labelled Nonfiction.
actual_cats = ['Nonfiction'] * len(preddicted_cats)

In [None]:
# Put the known and the predicted labels side by side, one row per classified description.
predicted_df = pd.DataFrame(
    {
        "actual_categories": actual_cats,
        "predicted_categories": preddicted_cats,
    }
)

In [None]:
# Preview the first rows of the actual-vs-predicted table.
predicted_df.head()

In [None]:
# Flag each row with 1 when the predicted label equals the known label, else 0.
labels_match = predicted_df['actual_categories'] == predicted_df['predicted_categories']
predicted_df['correct_prediction'] = np.where(labels_match, 1, 0)

In [None]:
# Share of rows whose prediction matched the known label (mean of the 0/1 flags).
predicted_df['correct_prediction'].mean()

In [None]:
# Placeholders; both names are reassigned when predictions are generated further down.
isbns = []
predicted_cats = []

# Books that still have no simple category: keep only the key and the text to classify.
needs_category = books['simple_categories'].isna()
missing_cats = books.loc[needs_category, ['isbn13', 'description']].reset_index(drop=True)

In [None]:
# Build the classifier inputs for the books that still lack a category.
# Each description is cut to 200 characters; NaN or blank descriptions are skipped.
# ISBNs and descriptions are filtered together so isbns[k] always belongs to
# sequences[k] (filtering only the descriptions would shift the two lists apart).
kept_pairs = [
    (isbn, str(desc)[:200])
    for isbn, desc in zip(missing_cats["isbn13"].tolist(), missing_cats["description"].tolist())
    if pd.notna(desc) and str(desc)[:200].strip()
]
isbns = [isbn for isbn, _ in kept_pairs]
sequences = [text for _, text in kept_pairs]
predicted_cats = generate_predictions(sequences, fiction_categories, batch_size=128)




In [None]:
# Pair every prediction with the ISBN of the description it was generated from.
# The condition below mirrors the filter used when the sequences were built:
# descriptions that are NaN, or blank after cutting to 200 characters, were skipped.
descriptions = missing_cats["description"].tolist()
isbns_full = missing_cats["isbn13"].tolist()

matching_isbns = [
    isbn
    for isbn, desc in zip(isbns_full, descriptions)
    if pd.notna(desc) and str(desc)[:200].strip()
]

# Fail loudly on a length mismatch: truncating one list to fit the other would
# attach predictions to the wrong books without any visible error.
if len(matching_isbns) != len(predicted_cats):
    raise ValueError(
        f"Got {len(predicted_cats)} predictions for {len(matching_isbns)} "
        "non-empty descriptions; regenerate predicted_cats from the current missing_cats."
    )

missing_predicted_df = pd.DataFrame({
    "isbn13": matching_isbns,
    "predicted_categories": predicted_cats
})

print(f"✅ DataFrame created successfully with {len(missing_predicted_df)} rows")
print("📊 Predictions by category:")
print(missing_predicted_df['predicted_categories'].value_counts())

# Save results
missing_predicted_df.to_csv('missing_categories_predictions.csv', index=False)
print("💾 Results saved to missing_categories_predictions.csv")

In [None]:
# Preview the first rows of the ISBN-to-predicted-category table.
missing_predicted_df.head()

In [None]:
# Attach the predicted category to each book by ISBN, use it only where no simple
# category exists yet, then discard the helper column.
# NOTE(review): a left merge repeats book rows if an isbn13 occurs more than once
# in missing_predicted_df — confirm the key is unique.
books = books.merge(missing_predicted_df, how="left", on="isbn13")
already_labelled = books["simple_categories"].notna()
books["simple_categories"] = books["simple_categories"].where(
    already_labelled, books["predicted_categories"]
)
books = books.drop(columns=["predicted_categories"])

In [None]:
# Display the frame after the predicted categories were merged into simple_categories.
books

In [None]:
# Show the books whose raw category, lower-cased, is exactly one of these genre names.
genre_names = [
    "romance",
    "science fiction",
    "scifi",
    "fantasy",
    "horror",
    "mystery",
    "thriller",
    "comedy",
    "crime",
    "historical",
]
is_genre_row = books["categories"].str.lower().isin(genre_names)
books[is_genre_row]

In [None]:
# Write the enriched table to disk without the index column.
books.to_csv(path_or_buf="books_with_categories.csv", index=False)