Spaces:
Sleeping
Sleeping
| from fastapi import FastAPI, HTTPException, Query | |
| from pydantic import BaseModel | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.neighbors import NearestNeighbors | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.preprocessing import MinMaxScaler | |
| from rapidfuzz import process | |
| import numpy as np | |
| import pandas as pd | |
| import pickle | |
| import os | |
| import json | |
| import random | |
# App setup
# NOTE(review): the title string begins with "π " — presumably a mojibake'd
# emoji from the original source; confirm the intended character.
app = FastAPI(
    title="π OAU Book Recommender API",
    description="Text-based hybrid recommender (SentenceTransformer + TF-IDF + KNN)",
    version="1.0.0"
)
# Helper functions
def safe_int(value, default=0):
    """Coerce *value* to ``int``, returning *default* when conversion fails.

    Fix: the original used a bare ``except:``, which also swallows
    ``KeyboardInterrupt``/``SystemExit``; only conversion failures
    (``TypeError`` for un-convertible types, ``ValueError`` for bad
    strings/NaN) are caught now.
    """
    try:
        return int(value)
    except (TypeError, ValueError):
        return default
def safe_str(value, default="Unknown"):
    """Return ``str(value)``, or *default* for NaN/None/blank values."""
    text = str(value)
    if pd.isna(value) or not text.strip():
        return default
    return text
def get_image_url(row):
    """Return the row's cover-image URL, or a placeholder for missing/generic covers."""
    placeholder = "https://via.placeholder.com/150?text=No+Cover"
    raw = str(row.get("Image URL", "")).strip()
    # A usable URL is non-empty, not the generic "BK.png" stock image,
    # and not the literal string "unknown".
    if raw and "BK.png" not in raw and raw.lower() != "unknown":
        return raw
    return placeholder
# Load models and data
BASE_DIR = "book-recommender_model"
SENTENCE_MODEL_PATH = os.path.join(BASE_DIR, "sentence_model")
data = pd.read_csv(os.path.join(BASE_DIR, "books.csv"))
# Replace zero or missing Pages with a random value between 50 and 300.
# NOTE(review): this is a synthetic fill — downstream page filters operate on
# fake values for these rows, and values change on every process restart
# (random is unseeded).
data['Pages'] = pd.to_numeric(data['Pages'], errors='coerce').fillna(0).astype(int)
data['Pages'] = data['Pages'].apply(lambda x: random.randint(50, 300) if x == 0 else x)
# Replace zero or missing Year with a random value between 1970 and 2022
# (same synthetic-fill caveat as Pages).
data['Year'] = pd.to_numeric(data['Year'], errors='coerce').fillna(0).astype(int)
data['Year'] = data['Year'].apply(lambda x: random.randint(1970, 2022) if x == 0 else x)
# Load model files
# SECURITY NOTE(review): pickle.load executes arbitrary code from the file —
# acceptable only because these are first-party training artifacts; never
# point BASE_DIR at untrusted input.
hybrid_features = np.load(os.path.join(BASE_DIR, "hybrid_features.npy"))
with open(os.path.join(BASE_DIR, "tfidf_vectorizer.pkl"), "rb") as f:
    tfidf_vectorizer = pickle.load(f)
with open(os.path.join(BASE_DIR, "scaler.pkl"), "rb") as f:
    scaler = pickle.load(f)
with open(os.path.join(BASE_DIR, "knn_model.pkl"), "rb") as f:
    knn = pickle.load(f)
# Sentence embedding model saved alongside the other artifacts.
model = SentenceTransformer(SENTENCE_MODEL_PATH)
# Add extra fields
# Build a one-sentence natural-language blurb per book from its metadata;
# returned verbatim in the API's "description" field.
data['Description'] = data.apply(
    lambda row: f"{safe_str(row['Title'])} by {safe_str(row['Author'])} is a {safe_str(row['Literary Form'])} book published in {safe_str(row['Year'])} by {safe_str(row['Publisher'])}. Format: {safe_str(row['Format'])}.",
    axis=1
)
# Schema
class RecommendationResponse(BaseModel):
    """Response shape for a single recommended book.

    NOTE(review): not referenced as a ``response_model`` anywhere in this
    chunk — confirm it is wired to the recommendation endpoint.
    """
    title: str
    author: str
    pages: int
    year: int
    genre: str       # mapped from the "Literary Form" column
    rating: float
    link: str
    image_url: str   # placeholder URL substituted when no cover exists
    description: str
# Routes
# NOTE(review): no @app.get("/") decorator is visible in this chunk —
# presumably lost in extraction; confirm the route is registered.
def home():
    """Root endpoint: simple liveness message."""
    payload = {"msg": "π OAU Book Recommender API is running!"}
    return payload
def health_check():
    """Health probe reporting service status and catalogue size.

    NOTE(review): no route decorator visible in this chunk — confirm registration.
    """
    book_count = len(data)
    return {"status": "healthy", "books": book_count}
def get_rating_stats():
    """Summarise the catalogue's rating column: mean plus value counts.

    NOTE(review): no route decorator visible in this chunk — confirm registration.
    """
    ratings = data['rating']
    stats = {
        "mean_rating": float(ratings.mean()),
        "rating_distribution": ratings.value_counts().to_dict(),
    }
    return stats
def get_metrics():
    """Load offline evaluation metrics from disk, or an error dict if absent.

    NOTE(review): no route decorator visible in this chunk — confirm registration.
    """
    try:
        with open("book_recommender_metrics.json", "r") as f:
            metrics = json.load(f)
    except FileNotFoundError:
        return {"error": "Metrics file not found."}
    return metrics
# ONLY text-based recommendation endpoint.
# NOTE(review): no @app.get/@app.post decorator is visible in this chunk —
# presumably lost in extraction; confirm the route is registered.
def recommend_by_text(
    query: str = Query(..., min_length=2),
    top_n: int = 5,
    min_pages: int = 0,
    max_pages: int = 5000,
    genre: str = "",
    min_rating: float = 0.0
):
    """Return up to *top_n* books matching a free-text query.

    Builds a hybrid query vector (TF-IDF + scaled page count + sentence
    embedding), finds nearest neighbours with the pre-fitted KNN model, then
    post-filters by page range, genre substring and minimum rating.
    """
    # Feature layout must match the training-time hybrid_features layout:
    # [tfidf | scaled pages | embedding] — presumably; confirm against the
    # training script that produced hybrid_features.npy.
    query_tfidf = tfidf_vectorizer.transform([query])
    query_embed = model.encode([query])
    query_pages = scaler.transform([[0]])  # Could be enhanced to use actual query metadata
    query_vector = np.hstack((
        query_tfidf.toarray(),
        query_pages,
        query_embed
    ))
    # Over-fetch 2x neighbours so post-filtering can still fill top_n
    # (not guaranteed if the filters are strict).
    distances, indices = knn.kneighbors(query_vector, n_neighbors=top_n * 2)
    results = []
    for idx in indices[0]:
        row = data.loc[idx]
        # Drop candidates outside the requested page range.
        if not (min_pages <= safe_int(row['Pages']) <= max_pages):
            continue
        # Case-insensitive substring match against "Literary Form".
        if genre and genre.lower() not in str(row.get("Literary Form", "")).lower():
            continue
        if float(row['rating']) < min_rating:
            continue
        results.append({
            "title": row['Title'],
            "author": row['Author'],
            "pages": safe_int(row['Pages']),
            "year": safe_int(row['Year']),
            "genre": safe_str(row['Literary Form']),
            "rating": float(row['rating']),
            "link": row.get('Link', '#'),
            "image_url": get_image_url(row),
            "description": row['Description']
        })
    # Truncate to the requested count after filtering.
    return results[:top_n]