# Source: app.py, commit 9076af8 ("Update app.py", verified) by Emeritus-21.
from fastapi import FastAPI, HTTPException, Query
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from rapidfuzz import process
import numpy as np
import pandas as pd
import pickle
import os
import json
import random
# App setup
# ASGI application object; the route decorators below register on it.
# The leading book emoji (U+1F4DA) is written as an escape so the title
# cannot be corrupted again if this file is re-saved in another encoding.
app = FastAPI(
    title="\U0001F4DA OAU Book Recommender API",
    description="Text-based hybrid recommender (SentenceTransformer + TF-IDF + KNN)",
    version="1.0.0",
)
# Helper functions
def safe_int(value, default=0):
    """Convert *value* to ``int``, falling back to *default* on failure.

    Args:
        value: Anything accepted by ``int()`` (numbers, numeric strings, ...).
        default: Returned when the conversion is not possible.

    Returns:
        ``int(value)``, or *default* if ``int()`` rejects the value.
    """
    try:
        return int(value)
    # Only conversion failures are swallowed: TypeError (e.g. None),
    # ValueError (e.g. "abc", NaN) and OverflowError (infinity). A bare
    # ``except`` would also hide KeyboardInterrupt and SystemExit.
    except (TypeError, ValueError, OverflowError):
        return default
def safe_str(value, default="Unknown"):
    """Return *value* as text, or *default* when it is missing or blank.

    A value counts as missing when ``pd.isna`` reports it as such, and as
    blank when its string form contains nothing but whitespace. The returned
    text is the unstripped ``str(value)``.
    """
    if pd.isna(value):
        return default
    text = str(value)
    return text if text.strip() != "" else default
def get_image_url(row):
    """Return the cover image URL for *row*, or a placeholder image URL.

    Args:
        row: Mapping-like record exposing ``.get`` (a dict or a pandas
            Series); the URL is read from its ``"Image URL"`` entry.

    Returns:
        The stripped URL, or the placeholder when the entry is absent,
        missing (None/NaN), blank, contains ``"BK.png"``, or equals
        ``"unknown"`` (case-insensitive).
    """
    placeholder = "https://via.placeholder.com/150?text=No+Cover"

    raw = row.get("Image URL", "")
    # Missing cells must be caught before str(): str(nan) is "nan" and
    # str(None) is "None", which would otherwise be returned as a URL.
    if pd.isna(raw):
        return placeholder

    url = str(raw).strip()
    if not url or "BK.png" in url or url.lower() == "unknown":
        return placeholder
    return url
# Load models and data
BASE_DIR = "book-recommender_model"
SENTENCE_MODEL_PATH = os.path.join(BASE_DIR, "sentence_model")


def _randomize_missing(column, low, high):
    """Coerce *column* to int and replace zero/unparseable entries.

    Non-numeric and missing entries are first turned into 0; every 0 is then
    replaced by a random integer drawn from ``[low, high]`` (inclusive).
    """
    as_int = pd.to_numeric(column, errors='coerce').fillna(0).astype(int)
    return as_int.apply(lambda v: v if v != 0 else random.randint(low, high))


# Book catalogue backing every endpoint.
data = pd.read_csv(os.path.join(BASE_DIR, "books.csv"))

# Zero or missing page counts / publication years are filled with made-up
# values. No random seed is set in this section, so the fill-ins are
# presumably different on every process start.
data['Pages'] = _randomize_missing(data['Pages'], 50, 300)
data['Year'] = _randomize_missing(data['Year'], 1970, 2022)
# Load model files
def _load_pickle(filename):
    """Unpickle and return the artifact stored as ``BASE_DIR/filename``.

    NOTE: pickle executes code while loading; only ship artifacts that were
    produced by a trusted training run.
    """
    with open(os.path.join(BASE_DIR, filename), "rb") as handle:
        return pickle.load(handle)


# Precomputed feature matrix (loaded here; not referenced by the routes
# visible in this file).
hybrid_features = np.load(os.path.join(BASE_DIR, "hybrid_features.npy"))

# Fitted preprocessing objects and the nearest-neighbour index used to
# vectorise and match incoming queries.
tfidf_vectorizer = _load_pickle("tfidf_vectorizer.pkl")
scaler = _load_pickle("scaler.pkl")
knn = _load_pickle("knn_model.pkl")

# Sentence embedding model used to encode the query text.
model = SentenceTransformer(SENTENCE_MODEL_PATH)
# Add extra fields
def _build_description(row):
    """Compose the one-sentence summary stored in the ``Description`` column.

    Every field goes through ``safe_str``, so missing or blank values are
    rendered with its default placeholder text.
    """
    title = safe_str(row['Title'])
    author = safe_str(row['Author'])
    literary_form = safe_str(row['Literary Form'])
    year = safe_str(row['Year'])
    publisher = safe_str(row['Publisher'])
    book_format = safe_str(row['Format'])
    return f"{title} by {author} is a {literary_form} book published in {year} by {publisher}. Format: {book_format}."


# Row-wise: one generated description per book.
data['Description'] = data.apply(_build_description, axis=1)
# Schema
# Response schema for one recommended book (used as the response_model of
# the recommendation route). Documented with comments rather than a class
# docstring so the generated schema description stays as it was.
class RecommendationResponse(BaseModel):
    title: str        # book title
    author: str       # book author
    pages: int        # page count
    year: int         # publication year
    genre: str        # taken from the "Literary Form" column
    rating: float     # book rating
    link: str         # link to the book
    image_url: str    # cover image URL, or a placeholder image
    description: str  # generated one-sentence summary
# Routes
@app.get("/")
def home():
    """Return a short message confirming that the API is up."""
    # The book emoji (U+1F4DA) is written as an escape so the message cannot
    # be corrupted again if this file is re-saved in another encoding.
    return {"msg": "\U0001F4DA OAU Book Recommender API is running!"}
@app.get("/health")
def health_check():
    """Report a fixed "healthy" status plus the number of loaded book rows."""
    book_count = len(data)
    return dict(status="healthy", books=book_count)
@app.get("/ratings")
def get_rating_stats():
    """Summarise the ``rating`` column of the catalogue.

    Returns:
        A dict with the mean rating and a mapping from each distinct rating
        value to the number of books carrying it.
    """
    ratings = data['rating']
    distribution = ratings.value_counts().to_dict()
    average = float(ratings.mean())
    return {"mean_rating": average, "rating_distribution": distribution}
@app.get("/metrics")
def get_metrics():
    """Return the contents of ``book_recommender_metrics.json``.

    The file is looked up relative to the current working directory.

    Returns:
        The parsed JSON document, or ``{"error": "Metrics file not found."}``
        when the file does not exist.
    """
    try:
        # JSON is UTF-8 text; decode it explicitly instead of relying on the
        # platform's default encoding.
        with open("book_recommender_metrics.json", "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return {"error": "Metrics file not found."}
# Text-based recommendation endpoint (the only recommendation route)
@app.get("/recommend/query/", response_model=list[RecommendationResponse])
def recommend_by_text(
    query: str = Query(..., min_length=2),
    top_n: int = 5,
    min_pages: int = 0,
    max_pages: int = 5000,
    genre: str = "",
    min_rating: float = 0.0
):
    """Recommend books whose features are nearest to a free-text query.

    The query is turned into a vector by concatenating its TF-IDF features,
    a scaled page-count placeholder and its sentence embedding, in that
    order. The nearest neighbours of that vector are then filtered.

    Args:
        query: Free-text search string (at least 2 characters).
        top_n: Maximum number of recommendations to return; must be >= 1.
        min_pages: Inclusive lower bound on the page count.
        max_pages: Inclusive upper bound on the page count.
        genre: Optional case-insensitive substring that must occur in the
            book's "Literary Form"; empty disables the filter.
        min_rating: Books rated below this value are dropped.

    Returns:
        Up to ``top_n`` recommendation dicts, nearest first. Filters are
        applied to at most ``2 * top_n`` candidates, so fewer than ``top_n``
        results may come back.

    Raises:
        HTTPException: 400 when ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        # kneighbors() rejects a non-positive neighbour count; report it as a
        # client error instead of an internal server error.
        raise HTTPException(status_code=400, detail="top_n must be at least 1.")

    # Over-fetch so that some candidates can be filtered out, but never ask
    # for more neighbours than the index (or the catalogue) contains.
    n_candidates = min(
        top_n * 2,
        len(data),
        getattr(knn, "n_samples_fit_", len(data)),
    )
    if n_candidates < 1:
        return []

    query_tfidf = tfidf_vectorizer.transform([query])
    query_embed = model.encode([query])
    # The query carries no page count, so a scaled 0 is used as a placeholder.
    query_pages = scaler.transform([[0]])
    query_vector = np.hstack((
        query_tfidf.toarray(),
        query_pages,
        query_embed
    ))

    _, indices = knn.kneighbors(query_vector, n_neighbors=n_candidates)

    wanted_genre = genre.lower()
    results = []
    for idx in indices[0]:
        # kneighbors() returns row positions, hence positional indexing.
        row = data.iloc[idx]

        pages = safe_int(row['Pages'])
        if not (min_pages <= pages <= max_pages):
            continue
        if wanted_genre and wanted_genre not in str(row.get("Literary Form", "")).lower():
            continue
        rating = float(row['rating'])
        if rating < min_rating:
            continue

        results.append({
            # Text fields go through safe_str so that missing (NaN) cells
            # still satisfy the ``str`` fields of the response model.
            "title": safe_str(row['Title']),
            "author": safe_str(row['Author']),
            "pages": pages,
            "year": safe_int(row['Year']),
            "genre": safe_str(row['Literary Form']),
            "rating": rating,
            "link": safe_str(row.get('Link'), '#'),
            "image_url": get_image_url(row),
            "description": row['Description']
        })
        if len(results) == top_n:
            break

    return results