# Xtract / app.py
import os

import faiss
import gdown
import numpy as np
import pandas as pd
from fastapi import FastAPI, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer

app = FastAPI(title="Research Paper Recommendation API")
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow the Next.js frontend origin
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

print("🔄 Loading dataset and FAISS index...")

def download_from_drive(file_id, output):
    if not os.path.exists(output):
        print(f"📥 Downloading {output} from Google Drive...")
        url = f"https://drive.google.com/uc?id={file_id}"
        gdown.download(url, output, quiet=False)
    else:
        print(f"✅ {output} already exists, skipping download.")

download_from_drive("1ME6Bb5WjVbIYr4-0iF0kUa35-82DTsUn", "papers_with_embeddings.csv")
download_from_drive("1IwOkhBu-odM2GYmvZdS1Q5AHkf0pDV02", "embeddings.npy")
download_from_drive("1MEB_4ZGunbxi65jW9UN0L8bnp9VUCH8s", "papers_index.faiss")
try:
    df = pd.read_csv("papers_with_embeddings.csv")
    embeddings = np.load("embeddings.npy")
    index = faiss.read_index("papers_index.faiss")
except Exception as e:
    raise RuntimeError(f"❌ Failed to load files: {e}") from e

model = SentenceTransformer("allenai/specter2_base")
print(f"✅ Loaded {len(df)} papers and FAISS index with {index.ntotal} vectors.")

class Paper(BaseModel):
    id: str
    title: str
    authors: str
    update_date: str
    abstract: str | None = None
    category_code: str | None = None
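
# The endpoints below currently return plain dicts; Paper documents the expected
# record shape and could be attached via response_model for strict validation.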

def get_recommendations(paper_id: str, top_k: int = 6):
    paper = df[df["id"].astype(str) == str(paper_id)]
    if paper.empty:
        raise HTTPException(status_code=404, detail="Paper not found")
    row = paper.iloc[0]
    # Guard against missing abstracts so the query text never embeds "nan".
    abstract = "" if pd.isna(row["abstract"]) else row["abstract"]
    text = f"{row['title']}. {abstract}"
    query_vec = model.encode([text], normalize_embeddings=True)
    # Fetch one extra neighbor so the query paper can be dropped from its own results.
    D, I = index.search(query_vec, top_k + 1)
    recs = df.iloc[I[0]].copy()
    recs["similarity"] = D[0]
    # Exclude the query paper itself
    recs = recs[recs["id"].astype(str) != str(paper_id)]
    return recs[["id", "title", "authors", "update_date", "abstract", "similarity"]].head(top_k).to_dict(orient="records")

def search_papers(query_text: str, top_k: int = 50):
    if not query_text.strip():
        raise HTTPException(status_code=400, detail="Query cannot be empty.")
    query_vec = model.encode([query_text], normalize_embeddings=True)
    D, I = index.search(query_vec, top_k)
    recs = df.iloc[I[0]].copy()
    recs["similarity"] = D[0]
    # Include the abstract so the frontend can render previews.
    return recs[["id", "title", "authors", "update_date", "abstract", "similarity"]].to_dict(orient="records")
@app.get("/")
def root():
return {"message": "SPECTER + FAISS Recommendation API is running 🚀"}
@app.get("/paper/{paper_id}")
def get_paper(paper_id: str):
paper = df[df["id"].astype(str) == str(paper_id)]
if paper.empty:
raise HTTPException(status_code=404, detail="Paper not found")
return paper.iloc[0].to_dict()
@app.get("/recommend/{paper_id}")
def recommend_papers(paper_id: str, top_k: int = 6):
try:
recs = get_recommendations(paper_id, top_k)
return recs
except HTTPException:
raise
except Exception as e:
raise HTTPException(status_code=500, detail=f"Recommendation error: {e}")
@app.get("/search")
def search_endpoint(query: str = Query(..., description="Search text query"), top_k: int = 50):
"""
Search for semantically similar papers using SPECTER embeddings.
Example: /search?query=graph neural networks
"""
try:
results = search_papers(query, top_k)
return results
except Exception as e:
raise HTTPException(status_code=500, detail=f"Search error: {e}")
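
# --- Usage sketch ---
# Illustrative local run and requests (the port is arbitrary and "2104.08821"
# is a hypothetical arXiv-style id, not taken from the dataset):
#   uvicorn app:app --host 0.0.0.0 --port 8000
#   curl "http://localhost:8000/paper/2104.08821"
#   curl "http://localhost:8000/recommend/2104.08821?top_k=6"
#   curl "http://localhost:8000/search?query=graph%20neural%20networks&top_k=10"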