# Course Recommendation API -- FastAPI backend serving semantic search
# over precomputed course embeddings.
from contextlib import asynccontextmanager
from pathlib import Path
from typing import List, Dict, Any

import numpy as np
import pandas as pd
import uvicorn
from fastapi import FastAPI, Query
from fastapi.middleware.cors import CORSMiddleware
from sentence_transformers import SentenceTransformer
# ----------------------------
# Paths & Setup
# ----------------------------
BASE_DIR = Path(__file__).resolve().parent
EMBEDDINGS_PATH = BASE_DIR / "embeddings" / "course_embeddings.npy"
METADATA_PATH = BASE_DIR / "embeddings" / "course_metadata.csv"
# Global variables for caching
model = None
course_embeddings = None
df = None
def load_resources():
global model, course_embeddings, df
print("Loading model and data...")
# Load Model
model = SentenceTransformer("all-MiniLM-L6-v2")
# Load Data
if not EMBEDDINGS_PATH.exists() or not METADATA_PATH.exists():
raise FileNotFoundError(f"Data files not found. Please ensure {EMBEDDINGS_PATH} and {METADATA_PATH} exist.")
course_embeddings = np.load(EMBEDDINGS_PATH)
df = pd.read_csv(METADATA_PATH)
print("Resources loaded successfully.")
@asynccontextmanager
async def lifespan(app: FastAPI):
load_resources()
yield
# ----------------------------
# Init App
# ----------------------------
app = FastAPI(title="Course Recommendation API", lifespan=lifespan)
# Allow CORS for React Frontend
app.add_middleware(
CORSMiddleware,
allow_origins=["*"], # Allow all origins for dev; specify localhost:5173 in prod
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# ----------------------------
# API Endpoints
# ----------------------------
@app.get("/")
async def root():
return {"message": "Course Recommendation API is running!", "docs_url": "/docs"}
@app.get("/search")
async def search(q: str = Query(..., min_length=1)):
"""
Semantic search for courses based on query 'q'.
Replicates the logic from the Streamlit app.
"""
global model, course_embeddings, df
if model is None or course_embeddings is None or df is None:
return {"error": "Server is not ready. Resources not loaded."}
# 1. Encode query
query_embedding = model.encode(q, normalize_embeddings=True)
# 2. Calculate Cosine Similarity
# course_embeddings should be shape (N, D), query is (D,)
# Result is (N,)
similarities = np.dot(course_embeddings, query_embedding)
# Create a working copy of dataframe to avoid modifying global state continuously (though pandas is usually copy-on-write or we just make a view)
# We'll create a new DataFrame with the scores for this request
res_df = df.copy()
res_df["similarity"] = similarities
# Smart Filter: Check for "free" in query
# If the user explicitly asks for "free", strictly filter for unpaid courses.
if "free" in q.lower().split():
print("Smart Filter: 'free' detected. Filtering for unpaid courses.")
# Ensure 'is_paid' is treated as boolean. If read from CSV without specific dtype, it might be correct,
# but robust code handles nuances.
res_df = res_df[res_df["is_paid"] == False]
# 3. Quality signals (Pre-calculation could be done once, but we follow app.py flow)
res_df["rating_norm"] = res_df["rating"] / 5.0
res_df["reviews_norm"] = np.log1p(res_df["num_reviews"])
# 4. Filter First: Keep only top 50 semantically relevant courses
# This removes popular but irrelevant courses from the running
res_df = res_df.sort_values(by="similarity", ascending=False).head(50)
# 5. Final Score Logic (Revised to prioritize Relevance)
# 90% Similarity, 10% Quality Signals (just as a tie-breaker for relevant courses)
res_df["final_score"] = (
0.90 * res_df["similarity"] +
0.05 * res_df["rating_norm"] +
0.05 * res_df["reviews_norm"]
)
# 6. Top 6 results
top_results = res_df.sort_values(by="final_score", ascending=False).head(6)
# Debug: Check if embeddings are working
print(f"\n--- Search Logic Debug ---")
print(f"Query: '{q}'")
print("Top results raw similarity vs final score:")
print(top_results[["title", "similarity", "final_score"]].to_string(index=False))
print("--------------------------\n")
# 6. Format response
response = []
for _, row in top_results.iterrows():
response.append({
"title": row["title"],
"rating": float(row["rating"]),
"num_reviews": int(row["num_reviews"]),
"is_paid": bool(row["is_paid"]),
"url": row["url"]
})
return response
if __name__ == "__main__":
uvicorn.run("server:app", host="127.0.0.1", port=8000, reload=True)