uddi12 commited on
Commit
ebe2bca
·
verified ·
1 Parent(s): 26f74e4

Upload 4 files

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. Dockerfile +8 -0
  3. app.py +115 -0
  4. papers_index.faiss +3 -0
  5. requirements.txt +7 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ papers_index.faiss filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
FROM python:3.11

WORKDIR /app

# Install dependencies before copying the app so this (slow) layer is
# cached unless requirements.txt itself changes.
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application code (and any bundled data files) last.
COPY . /app

EXPOSE 7860
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Stdlib imports.
import os

# Third-party imports.
import faiss
import gdown
import numpy as np
import pandas as pd
from fastapi import FastAPI, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer

# FastAPI application serving paper recommendations and semantic search.
app = FastAPI(title="Research Paper Recommendation API")

# Wide-open CORS so the frontend can reach this API from any origin.
# NOTE(review): browsers ignore allow_credentials=True when allow_origins
# is the wildcard "*" — confirm whether credentialed requests are needed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow Next.js frontend
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

print("🔄 Loading dataset and FAISS index...")
22
def download_from_drive(file_id, output):
    """Fetch *output* from Google Drive (by file id) unless it already exists.

    Args:
        file_id: Google Drive file id of the asset to download.
        output: Local path the file is saved to.

    Raises:
        RuntimeError: if the download finishes but *output* is still missing.
            gdown signals failure by returning None rather than raising, so
            without this check a failed download would only surface later as
            a confusing "file not found" at load time.
    """
    if not os.path.exists(output):
        print(f"📥 Downloading {output} from Google Drive...")
        url = f"https://drive.google.com/uc?id={file_id}"
        gdown.download(url, output, quiet=False)
        # Verify the file actually landed on disk (gdown fails silently).
        if not os.path.exists(output):
            raise RuntimeError(f"Download of {output} (id={file_id}) failed")
    else:
        print(f"✅ {output} already exists, skipping download.")
29
+
30
# Startup artifacts: (Google Drive file id, local filename).
# Each is fetched once at boot if not already on disk.
_ARTIFACTS = [
    ("1ME6Bb5WjVbIYr4-0iF0kUa35-82DTsUn", "papers_with_embeddings.csv"),
    ("1IwOkhBu-odM2GYmvZdS1Q5AHkf0pDV02", "embeddings.npy"),
    ("1MEB_4ZGunbxi65jW9UN0L8bnp9VUCH8s", "papers_index.faiss"),
]
for _file_id, _filename in _ARTIFACTS:
    download_from_drive(_file_id, _filename)
33
+
34
# Load the paper corpus, its precomputed embeddings, and the FAISS index.
# Any failure here is fatal: the API cannot serve without these files.
try:
    df = pd.read_csv("papers_with_embeddings.csv")
    embeddings = np.load("embeddings.npy")
    index = faiss.read_index("papers_index.faiss")
except Exception as e:
    raise RuntimeError(f" Failed to load files: {e}")

# SPECTER2 sentence encoder used to embed query text at request time.
# NOTE(review): `embeddings` is loaded but never referenced by the endpoints
# below — presumably the FAISS index was built from it; confirm it is needed.
model = SentenceTransformer("allenai/specter2_base")

print(f" Loaded {len(df)} papers and FAISS index with {index.ntotal} vectors.")
44
+
45
class Paper(BaseModel):
    """Schema for a single research paper record.

    NOTE(review): this model is not referenced by any endpoint in this
    module — presumably intended as a `response_model`; confirm before
    removing or wiring it up.
    """
    id: str
    title: str
    authors: str
    update_date: str
    abstract: str | None = None
    category_code: str | None = None
52
+
53
def get_recommendations(paper_id: str, top_k: int = 6):
    """Return up to *top_k* papers most similar to the paper with *paper_id*.

    The seed paper's title and abstract are re-encoded with SPECTER and
    searched against the FAISS index; the seed paper itself is excluded
    from the results.

    Args:
        paper_id: id of the seed paper (compared against df["id"] as str).
        top_k: maximum number of recommendations to return.

    Returns:
        List of dicts with id, title, authors, update_date, abstract,
        and similarity keys.

    Raises:
        HTTPException: 404 if the paper id is not found in the dataset.
    """
    paper = df[df["id"].astype(str) == str(paper_id)]
    if paper.empty:
        raise HTTPException(status_code=404, detail="Paper not found")

    row = paper.iloc[0]
    # Guard against a missing abstract: NaN would stringify as "nan" and
    # pollute the query embedding.
    abstract = row["abstract"]
    if pd.isna(abstract):
        abstract = ""
    text = f"{row['title']}. {abstract}"
    query_vec = model.encode([text], normalize_embeddings=True)

    # Ask for one extra hit so dropping the seed paper still leaves top_k.
    D, I = index.search(query_vec, top_k + 1)
    recs = df.iloc[I[0]].copy()
    recs["similarity"] = D[0]
    # Exclude the query paper itself
    recs = recs[recs["id"].astype(str) != str(paper_id)]

    return recs[["id", "title", "authors", "update_date", "abstract", "similarity"]].head(top_k).to_dict(orient="records")
68
+
69
+
70
def search_papers(query_text: str, top_k: int = 50):
    """Semantic search: return the *top_k* papers closest to *query_text*.

    Args:
        query_text: free-text query embedded with SPECTER.
        top_k: number of results to return (clamped to the index size).

    Returns:
        List of dicts with id, title, authors, update_date, abstract,
        and similarity keys.

    Raises:
        HTTPException: 400 for an empty/whitespace query or non-positive
            top_k.
    """
    if not query_text.strip():
        raise HTTPException(status_code=400, detail="Query cannot be empty.")
    if top_k < 1:
        raise HTTPException(status_code=400, detail="top_k must be at least 1.")
    # FAISS pads results with index -1 when asked for more hits than stored
    # vectors, and df.iloc[-1] would then silently return the LAST row —
    # clamp instead of producing bogus results.
    top_k = min(top_k, index.ntotal)

    query_vec = model.encode([query_text], normalize_embeddings=True)
    D, I = index.search(query_vec, top_k)

    recs = df.iloc[I[0]].copy()
    recs["similarity"] = D[0]
    # Include abstract in output
    return recs[["id", "title", "authors", "update_date", "abstract", "similarity"]].to_dict(orient="records")
81
+
82
+
83
@app.get("/")
def root():
    """Health-check / landing endpoint."""
    status = {"message": "SPECTER + FAISS Recommendation API is running 🚀"}
    return status
86
+
87
@app.get("/paper/{paper_id}")
def get_paper(paper_id: str):
    """Return the full record of a single paper by id.

    Raises:
        HTTPException: 404 for an unknown paper id.
    """
    paper = df[df["id"].astype(str) == str(paper_id)]
    if paper.empty:
        raise HTTPException(status_code=404, detail="Paper not found")
    record = paper.iloc[0].to_dict()
    # NaN is not valid JSON (pandas uses it for missing cells); map missing
    # float values to None so the response serializes cleanly.
    return {
        key: (None if isinstance(value, float) and pd.isna(value) else value)
        for key, value in record.items()
    }
93
+
94
@app.get("/recommend/{paper_id}")
def recommend_papers(paper_id: str, top_k: int = 6):
    """Recommendation endpoint: top_k papers similar to *paper_id*."""
    try:
        return get_recommendations(paper_id, top_k)
    except HTTPException:
        # Propagate deliberate HTTP errors (e.g. the 404) untouched.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Recommendation error: {e}")
103
+
104
@app.get("/search")
def search_endpoint(query: str = Query(..., description="Search text query"), top_k: int = 50):
    """
    Search for semantically similar papers using SPECTER embeddings.
    Example: /search?query=graph neural networks
    """
    try:
        return search_papers(query, top_k)
    except HTTPException:
        # search_papers raises 400 for an empty query; re-raise it instead of
        # collapsing it into a 500 (matches recommend_papers' handling).
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Search error: {e}")
115
+
papers_index.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fb48220b52b84c794b9409e70e4df599500f6a11f8d6e68c67630cfdb125455
3
+ size 596167725
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ pandas
4
+ numpy
5
+ faiss-cpu
6
+ sentence-transformers
7
+ gdown