Spaces:

srivishwaa55
/

course-recommendation-api

Sleeping

App Files Files Community

course-recommendation-api / server.py

srivishwaa55

Upload server.py with huggingface_hub

44ea9ef verified about 2 months ago

raw

history blame contribute delete

4.9 kB

	import numpy as np
	import pandas as pd
	from sentence_transformers import SentenceTransformer
	from pathlib import Path
	from contextlib import asynccontextmanager
	from fastapi import FastAPI, Query
	from fastapi.middleware.cors import CORSMiddleware
	import uvicorn
	from typing import List, Dict, Any

	# ----------------------------
	# Paths & Setup
	# ----------------------------
	# Data files are located relative to this source file rather than the
	# current working directory.
	BASE_DIR = Path(__file__).resolve().parent
	EMBEDDINGS_PATH = BASE_DIR.joinpath("embeddings", "course_embeddings.npy")
	METADATA_PATH = BASE_DIR.joinpath("embeddings", "course_metadata.csv")

	# Process-wide cache: all three stay None until load_resources() fills them.
	model = course_embeddings = df = None

	def load_resources():
	    """Load the embedding model and course data into the module-level cache.

	    The data files are checked and loaded before the model so that a missing
	    or inconsistent dataset fails fast, without first paying for the model
	    load. The globals ``model``, ``course_embeddings`` and ``df`` are only
	    assigned once every step has succeeded, so they are never left partially
	    initialised.

	    Raises:
	        FileNotFoundError: If the embeddings or the metadata file is missing.
	        ValueError: If the embeddings are not a 2-D array with exactly one
	            row per metadata row.
	    """
	    global model, course_embeddings, df
	    print("Loading model and data...")

	    # Load Data
	    if not EMBEDDINGS_PATH.exists() or not METADATA_PATH.exists():
	        raise FileNotFoundError(f"Data files not found. Please ensure {EMBEDDINGS_PATH} and {METADATA_PATH} exist.")

	    embeddings = np.load(EMBEDDINGS_PATH)
	    metadata = pd.read_csv(METADATA_PATH)

	    # Row i of the embeddings is scored against row i of the metadata at
	    # query time, so a mismatch would otherwise surface on every request.
	    if embeddings.ndim != 2 or embeddings.shape[0] != len(metadata):
	        raise ValueError(
	            f"Embeddings shape {embeddings.shape} does not match "
	            f"{len(metadata)} metadata rows."
	        )

	    # Load Model
	    encoder = SentenceTransformer("all-MiniLM-L6-v2")

	    # Publish everything together, only after all loads succeeded.
	    model, course_embeddings, df = encoder, embeddings, metadata
	    print("Resources loaded successfully.")

	@asynccontextmanager
	async def lifespan(app: FastAPI):
	    """Lifespan context for the app: load all resources, then hand over control.

	    The code before ``yield`` calls load_resources(); there is no code after
	    ``yield``, so nothing is released when the context exits.
	    """
	    # load_resources() is a plain synchronous call, so this coroutine does
	    # not reach the yield until the model and data are in memory.
	    load_resources()
	    yield

	# ----------------------------
	# Init App
	# ----------------------------
	app = FastAPI(title="Course Recommendation API", lifespan=lifespan)

	# CORS for the React frontend.
	# Credentials are disabled on purpose: a wildcard origin must not be
	# combined with credentialed requests (the CORS spec forbids it, and
	# allowing it would let any site call the API with the user's cookies).
	# The endpoints in this file use neither cookies nor auth headers.
	# If credentials are ever needed, list the exact frontend origin(s) in
	# allow_origins and only then set allow_credentials=True.
	app.add_middleware(
	    CORSMiddleware,
	    allow_origins=["*"],  # Allow all origins for dev; specify localhost:5173 in prod
	    allow_credentials=False,
	    allow_methods=["*"],
	    allow_headers=["*"],
	)

	# ----------------------------
	# API Endpoints
	# ----------------------------

	@app.get("/")
	async def root():
	    """Return a static status message together with the docs path."""
	    payload = {
	        "message": "Course Recommendation API is running!",
	        "docs_url": "/docs",
	    }
	    return payload

	def _mentions_free(query: str) -> bool:
	    """Return True when *query* contains the standalone word "free".

	    Tokens are split on whitespace and stripped of surrounding punctuation,
	    so "free," or "(free)" match while "freelance" does not.
	    """
	    return any(
	        token.strip(".,;:!?\"'()[]{}") == "free"
	        for token in query.lower().split()
	    )


	def _rank_courses(
	    metadata: pd.DataFrame,
	    similarities: np.ndarray,
	    free_only: bool = False,
	    pool_size: int = 50,
	    top_k: int = 6,
	) -> pd.DataFrame:
	    """Pick the best *top_k* courses from *metadata* for one query.

	    Args:
	        metadata: Course table; reads the columns ``is_paid``, ``rating``
	            and ``num_reviews``.
	        similarities: One similarity score per row of *metadata*.
	        free_only: Keep only rows whose ``is_paid`` equals ``False``.
	        pool_size: Number of most-similar courses kept before re-ranking.
	        top_k: Number of rows returned.

	    Returns:
	        A new DataFrame (the input is never modified) with the added columns
	        ``similarity``, ``rating_norm``, ``reviews_norm`` and
	        ``final_score``, sorted by ``final_score`` descending.

	    Raises:
	        ValueError: If the number of scores differs from the number of rows.
	    """
	    similarities = np.asarray(similarities).reshape(-1)
	    if len(similarities) != len(metadata):
	        raise ValueError(
	            f"Got {len(similarities)} similarity scores for "
	            f"{len(metadata)} courses."
	        )

	    # Work on row positions so that only the final candidate rows are
	    # copied, instead of duplicating the whole table on every request.
	    candidates = np.arange(len(metadata))
	    if free_only:
	        unpaid = metadata["is_paid"].eq(False).to_numpy(dtype=bool)
	        candidates = candidates[unpaid]

	    # Filter first: only the most semantically relevant courses compete, so
	    # popular but irrelevant ones never reach the re-ranking step.
	    best_first = np.argsort(-similarities[candidates], kind="stable")[:pool_size]
	    pool_idx = candidates[best_first]
	    pool = metadata.iloc[pool_idx].copy()
	    pool["similarity"] = similarities[pool_idx]

	    # Missing quality data counts as "no rating / no reviews" rather than
	    # producing NaN scores and a response that cannot be serialised.
	    pool["rating"] = pool["rating"].fillna(0)
	    pool["num_reviews"] = pool["num_reviews"].fillna(0)
	    pool["rating_norm"] = pool["rating"] / 5.0

	    # Scale log(1 + reviews) by the largest value in the whole table so the
	    # term lies in [0, 1]. Left unscaled it is unbounded and its 5% weight
	    # could outweigh the similarity term.
	    log_reviews = np.log1p(pool["num_reviews"].clip(lower=0))
	    max_log_reviews = np.log1p(metadata["num_reviews"].max())
	    if np.isfinite(max_log_reviews) and max_log_reviews > 0:
	        pool["reviews_norm"] = log_reviews / max_log_reviews
	    else:
	        pool["reviews_norm"] = 0.0

	    # 90% similarity, 10% quality signals (a tie-breaker among relevant hits).
	    pool["final_score"] = (
	        0.90 * pool["similarity"]
	        + 0.05 * pool["rating_norm"]
	        + 0.05 * pool["reviews_norm"]
	    )
	    return pool.sort_values(by="final_score", ascending=False).head(top_k)


	@app.get("/search")
	async def search(q: str = Query(..., min_length=1)):
	    """Semantic search for courses based on query 'q'.

	    The query is embedded and scored against every stored course embedding,
	    the 50 closest courses are re-ranked with a small rating / review-count
	    bonus, and the best 6 are returned. If the query contains the word
	    "free", only unpaid courses are considered.

	    Returns:
	        A list of dicts with the keys ``title``, ``rating``, ``num_reviews``,
	        ``is_paid`` and ``url``; or ``{"error": ...}`` when the resources
	        have not been loaded.
	    """
	    if model is None or course_embeddings is None or df is None:
	        return {"error": "Server is not ready. Resources not loaded."}

	    # 1. Encode query
	    # NOTE(review): encode() is a synchronous call made directly inside this
	    # coroutine, so the event loop waits on it for each request.
	    query_embedding = model.encode(q, normalize_embeddings=True)

	    # 2. Similarity: (N, D) dot (D,) -> (N,)
	    # The query vector is normalised; this is a cosine similarity only if the
	    # stored embeddings are unit-length too -- TODO confirm.
	    similarities = np.dot(course_embeddings, query_embedding)

	    # 3. Smart Filter: an explicit "free" restricts results to unpaid courses.
	    free_only = _mentions_free(q)
	    if free_only:
	        print("Smart Filter: 'free' detected. Filtering for unpaid courses.")

	    # 4. Candidate selection and re-ranking
	    top_results = _rank_courses(df, similarities, free_only=free_only)

	    # Debug: Check if embeddings are working
	    print("\n--- Search Logic Debug ---")
	    print(f"Query: '{q}'")
	    print("Top results raw similarity vs final score:")
	    print(top_results[["title", "similarity", "final_score"]].to_string(index=False))
	    print("--------------------------\n")

	    # 5. Format response (cast to built-in types for serialisation)
	    return [
	        {
	            "title": title,
	            "rating": float(rating),
	            "num_reviews": int(num_reviews),
	            "is_paid": bool(is_paid),
	            "url": url,
	        }
	        for title, rating, num_reviews, is_paid, url in zip(
	            top_results["title"],
	            top_results["rating"],
	            top_results["num_reviews"],
	            top_results["is_paid"],
	            top_results["url"],
	        )
	    ]

	if __name__ == "__main__":
	    # Script entry point: serve the app on the loopback interface with
	    # auto-reload switched on.
	    uvicorn.run(
	        "server:app",
	        host="127.0.0.1",
	        port=8000,
	        reload=True,
	    )