siham47 committed on
Commit
ee87c07
·
1 Parent(s): bc981d1

initial deployment

Browse files
Files changed (5) hide show
  1. Dockerfile +26 -0
  2. README.md +20 -6
  3. api_server.py +455 -0
  4. download_data.py +36 -0
  5. requirements.txt +11 -0
Dockerfile ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Image for the recommendation API (FastAPI + SPECTER2) served on port 7860.
FROM python:3.10-slim

WORKDIR /app

# Install system dependencies.
# --no-install-recommends keeps optional packages out of the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    git \
    && rm -rf /var/lib/apt/lists/*

# Hugging Face Spaces documents that Docker containers run as UID 1000.
# Create that user and hand it /app so download_data.py can create
# data_final/ at runtime and the model cache has a writable home directory.
RUN useradd -m -u 1000 user && chown user:user /app

# Copy requirements first so the dependency layer is cached across code edits.
COPY requirements.txt .

# Install Python dependencies (system-wide, still as root).
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code and the data download script.
COPY --chown=user:user api_server.py download_data.py ./

# Drop privileges for everything that happens at runtime.
USER user
ENV HOME=/home/user

# Expose port
EXPOSE 7860

# Download data, then start the server. `exec` replaces the shell with
# uvicorn so the server receives stop signals directly.
CMD ["sh", "-c", "python download_data.py && exec uvicorn api_server:app --host 0.0.0.0 --port 7860"]
README.md CHANGED
@@ -1,11 +1,25 @@
1
  ---
2
- title: Academic Recommender Api
3
- emoji: ⚑
4
- colorFrom: indigo
5
- colorTo: red
6
  sdk: docker
7
  pinned: false
8
- license: mit
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Academic Recommendation API
3
+ emoji: 📚
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: docker
7
  pinned: false
 
8
  ---
9
 
10
+ # Academic Paper Recommendation API
11
+
12
+ LLM-powered recommendation system for academic papers using SPECTER2 embeddings.
13
+
14
+ ## API Endpoints
15
+
16
+ - `GET /` - Health check
17
+ - `POST /recommend` - Get paper recommendations
18
+
19
+ ## Usage
20
+
21
+ ```bash
22
+ curl -X POST "https://YOUR-USERNAME-academic-recommendation-api.hf.space/recommend" \
23
+ -H "Content-Type: application/json" \
24
+ -d '{"query":"quantum entanglement","top_k":10}'
25
+ ```
api_server.py ADDED
@@ -0,0 +1,455 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Academic Recommendation API Server
3
+ Exposes the recommendation engine as a REST API for n8n integration.
4
+
5
+ Author: Siham Zaiad Al Kousa (U24200503)
6
+ Course: 1501531 Machine Learning
7
+ Date: December 2025
8
+ """
9
+
10
+ from fastapi import FastAPI, HTTPException
11
+ from fastapi.middleware.cors import CORSMiddleware
12
+ from pydantic import BaseModel, Field
13
+ from typing import List, Optional, Dict, Any
14
+ import json
15
+ import numpy as np
16
+ import torch
17
+ from pathlib import Path
18
+ import uvicorn
19
+
20
+ # SPECTER2 imports
21
+ from transformers import AutoTokenizer
22
+ from adapters import AutoAdapterModel
23
+ from sklearn.metrics.pairwise import cosine_similarity
24
+
25
+ # ============================================================================
26
+ # CONFIGURATION
27
+ # ============================================================================
28
+
29
# Runtime settings shared by the encoder, the engine and the API layer:
# data file locations, SPECTER2 checkpoint names, compute device and
# result-count limits.
CONFIG = dict(
    corpus_path='data_final/processed/corpus_with_embeddings.json',
    embeddings_path='data_final/processed/embeddings.npy',
    specter2_model='allenai/specter2_base',
    specter2_adapter='allenai/specter2_adhoc_query',
    # Use the GPU when torch can see one, otherwise fall back to CPU.
    device='cuda' if torch.cuda.is_available() else 'cpu',
    default_top_k=10,
    max_top_k=50,
)
38
+
39
+ # ============================================================================
40
+ # PYDANTIC MODELS (Request/Response schemas)
41
+ # ============================================================================
42
+
43
class RecommendationRequest(BaseModel):
    """Request schema for recommendations."""
    # Free-text query that gets embedded and compared against the corpus.
    query: str = Field(..., description="Search query")
    # Default and upper bound come from CONFIG so the schema cannot drift
    # from the configured limits (currently 10 and 50).
    top_k: int = Field(
        default=CONFIG['default_top_k'],
        ge=1,
        le=CONFIG['max_top_k'],
        description="Number of recommendations",
    )
    # All filters are optional; None means "do not filter on this field".
    filter_type: Optional[str] = Field(default=None, description="Filter by 'paper' or 'video'")
    year_min: Optional[int] = Field(default=None, description="Minimum publication year")
    year_max: Optional[int] = Field(default=None, description="Maximum publication year")
    category: Optional[str] = Field(default=None, description="Filter by arXiv category")
    min_citations: Optional[int] = Field(default=None, description="Minimum citation count")
52
+
53
+
54
class PaperMetadata(BaseModel):
    """Metadata for a single paper."""
    paper_id: str
    title: str
    authors: List[str]
    abstract: str
    published: str
    citations: int
    category: str
    # Under Pydantic v2 (pinned in requirements.txt) an Optional[...] field
    # without a default is still *required*; the explicit None default makes
    # these two fields genuinely optional.
    arxiv_id: Optional[str] = None
    url: Optional[str] = None
65
+
66
+
67
class RecommendationItem(BaseModel):
    """Single recommendation with scores."""
    # Identity and display fields of the recommended corpus item.
    id: str
    type: str
    title: str
    abstract: str
    # Free-form metadata dictionary as assembled by the recommendation engine.
    metadata: Dict[str, Any]
    # Named score components; every value must be numeric.
    scores: Dict[str, float]
    # Position of the item in the returned list.
    rank: int
76
+
77
+
78
class RecommendationResponse(BaseModel):
    """Response schema for recommendations."""
    # The query string the results belong to.
    query: str
    # Number of entries in `recommendations`.
    total_results: int
    recommendations: List[RecommendationItem]
    # Wall-clock duration of the request handling, in milliseconds.
    execution_time_ms: float
84
+
85
+
86
+ # ============================================================================
87
+ # SPECTER2 ENCODER
88
+ # ============================================================================
89
+
90
class SPECTER2Encoder:
    """Query encoder: a SPECTER2 base model with one active adapter."""

    def __init__(self, model_name: str, adapter_name: str, device: str):
        """Load tokenizer, base model and adapter, then move the model to `device`.

        Args:
            model_name: Checkpoint name passed to the tokenizer and model loaders.
            adapter_name: Adapter identifier loaded with source='hf' and activated.
            device: Device string handed to `torch.device` (e.g. 'cpu' or 'cuda').
        """
        self.device = torch.device(device)

        print(f"Loading SPECTER2 model: {model_name}")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoAdapterModel.from_pretrained(model_name)
        self.tokenizer = tokenizer
        self.model = model

        print(f"Loading adapter: {adapter_name}")
        model.load_adapter(adapter_name, source='hf', set_active=True)

        # Inference only: place the model on the target device in eval mode.
        model.to(self.device)
        model.eval()

        print(f"βœ“ SPECTER2 ready on {self.device}")

    def encode_query(self, query: str) -> np.ndarray:
        """Return the embedding of `query` as a NumPy vector.

        The text is tokenized (truncated to 512 tokens), run through the model
        without gradient tracking, and the hidden state at sequence position 0
        of the single batch entry is returned.
        """
        tokenizer_options = dict(
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt',
        )
        batch = self.tokenizer(query, **tokenizer_options).to(self.device)

        with torch.no_grad():
            model_output = self.model(**batch)
            first_token_states = model_output.last_hidden_state[:, 0, :]
            query_vector = first_token_states.cpu().numpy()[0]

        return query_vector
123
+
124
+
125
+ # ============================================================================
126
+ # RECOMMENDATION ENGINE (Simplified)
127
+ # ============================================================================
128
+
129
class RecommendationEngine:
    """Simplified recommendation engine for API.

    Holds the corpus items, their embedding matrix and a query encoder, and
    ranks items for a query by a weighted mix of cosine similarity, capped
    citation count and a recency term.
    """

    # Scoring constants (same values the scoring always used).
    _ASSUMED_AGE_MONTHS = 30.0     # age applied to every item; dates are not parsed yet
    _RECENCY_DECAY_MONTHS = 24.0   # e-folding time of exp(-age / 24), not a half-life
    _CITATION_CAP = 500.0          # citation count at which the impact term saturates
    _ABSTRACT_MAX_CHARS = 500      # abstracts are truncated to this length in results

    def __init__(self, corpus_path: str, embeddings_path: str, encoder: "SPECTER2Encoder"):
        """Load the corpus JSON and the embedding matrix from disk.

        Args:
            corpus_path: JSON file whose top-level object has an 'items' list
                and, optionally, a 'metadata' object.
            embeddings_path: `.npy` file loaded with `np.load`.
                # assumes row i is the embedding of corpus item i — TODO confirm
            encoder: Object providing `encode_query(str) -> np.ndarray`.
        """
        # Load corpus
        print(f"Loading corpus from: {corpus_path}")
        with open(corpus_path, 'r', encoding='utf-8') as f:
            corpus_data = json.load(f)

        # Extract items from the nested structure
        self.corpus = corpus_data.get('items', [])
        if not self.corpus:
            print("⚠️ Warning: No items found in corpus!")

        # Load embeddings
        print(f"Loading embeddings from: {embeddings_path}")
        self.embeddings = np.load(embeddings_path)

        # Surface a corpus/embedding size mismatch at startup instead of
        # letting it show up later as an IndexError inside a request.
        n_vectors = self.embeddings.shape[0] if self.embeddings.ndim else 0
        if n_vectors != len(self.corpus):
            print(f"⚠️ Warning: corpus has {len(self.corpus)} items "
                  f"but embeddings has {n_vectors} rows")

        # Store additional metadata if needed
        self.corpus_metadata = corpus_data.get('metadata', {})

        self.encoder = encoder

        print(f"βœ“ Loaded {len(self.corpus)} items")
        print(f"βœ“ Embeddings shape: {self.embeddings.shape}")
        print(f"βœ“ Corpus metadata keys: {list(self.corpus_metadata.keys())}")

    def recommend(self,
                  query: str,
                  top_k: int = 10,
                  filter_type: Optional[str] = None,
                  year_min: Optional[int] = None,
                  year_max: Optional[int] = None,
                  category: Optional[str] = None,
                  min_citations: Optional[int] = None) -> List[Dict]:
        """Generate recommendations with optional filters.

        Args:
            query: Text to embed with the encoder.
            top_k: Maximum number of items to return.
            filter_type: Keep only items whose 'type' equals this value
                (items without a 'type' count as 'paper').
            year_min: Drop items whose publication year is below this value.
            year_max: Drop items whose publication year is above this value.
            category: Case-insensitive substring to look for in the item's category.
            min_citations: Drop items with fewer citations than this value.

        Returns:
            Up to `top_k` dicts with keys 'id', 'type', 'title', 'abstract',
            'metadata', 'scores' and 'rank', sorted by descending final score.
        """
        # Encode query
        query_embedding = self.encoder.encode_query(query)

        # Compute similarities of the query against every corpus embedding
        similarities = cosine_similarity(
            query_embedding.reshape(1, -1),
            self.embeddings
        )[0]

        return self._rank(
            similarities,
            top_k=top_k,
            filter_type=filter_type,
            year_min=year_min,
            year_max=year_max,
            category=category,
            min_citations=min_citations,
        )

    def _rank(self,
              similarities,
              top_k: int = 10,
              filter_type: Optional[str] = None,
              year_min: Optional[int] = None,
              year_max: Optional[int] = None,
              category: Optional[str] = None,
              min_citations: Optional[int] = None) -> List[Dict]:
        """Filter, score and rank corpus items given per-item similarities.

        `similarities[i]` is the similarity of corpus item i. Items beyond the
        end of `similarities` have no embedding and are skipped. A filter value
        that is None (or otherwise falsy) is not applied.
        """
        import re  # function-scope: `re` is not imported at module level

        # Loop-invariant values, computed once per request.
        year_pattern = re.compile(r'\d{4}')
        wanted_category = category.lower() if category else None
        # Every item currently gets the same assumed age, so the recency term
        # is a constant offset that does not affect the ordering.
        recency = np.exp(-self._ASSUMED_AGE_MONTHS / self._RECENCY_DECAY_MONTHS)

        scored_items = []
        # zip() stops at the shorter sequence, so a corpus longer than the
        # embedding matrix no longer raises IndexError.
        for i, (item, raw_similarity) in enumerate(zip(self.corpus, similarities)):
            # Type filter (missing type defaults to paper)
            item_type = item.get('type', 'paper')
            if filter_type and item_type != filter_type:
                continue

            # A JSON null for 'metadata' is treated like a missing key.
            metadata = item.get('metadata') or {}

            # Year filter: items without a parseable year are kept.
            if year_min or year_max:
                year = self._publication_year(metadata, year_pattern)
                if year is not None:
                    if year_min and year < year_min:
                        continue
                    if year_max and year > year_max:
                        continue

            # Category filter: `or ''` keeps a null category from being
            # stringified to 'None' and matching a filter such as "no".
            raw_category = metadata.get('primary_category') or metadata.get('category') or ''
            if wanted_category and wanted_category not in str(raw_category).lower():
                continue

            # Citation filter; the same count is reused as the impact score.
            impact = self._citation_count(metadata)
            if min_citations and impact < min_citations:
                continue

            similarity = float(raw_similarity)

            # Weighted final score (60% sim, 20% impact normalized, 20% recency)
            impact_normalized = min(impact / self._CITATION_CAP, 1.0)
            final_score = 0.6 * similarity + 0.2 * impact_normalized + 0.2 * recency

            # The response schema declares `id` and `title` as strings, so
            # normalise non-string ids and null titles here.
            item_id = item.get('id')
            abstract = item.get('abstract') or item.get('abstract_cleaned') or ''

            scored_items.append({
                'id': str(item_id) if item_id is not None else f'item_{i}',
                'type': item_type,
                'title': item.get('title') or 'Untitled',
                'abstract': abstract[:self._ABSTRACT_MAX_CHARS],
                'metadata': {
                    'authors': metadata.get('authors', []),
                    'published': metadata.get('published', ''),
                    'citationCount': impact,
                    'primary_category': raw_category,
                    'arxiv_id': item.get('arxiv_id', ''),
                    'url': metadata.get('url', '') or metadata.get('pdf_url', ''),
                },
                'scores': {
                    'similarity': similarity,
                    'impact': impact,
                    'impact_normalized': impact_normalized,
                    'recency': recency,
                    'final_score': final_score,
                },
            })

        # Sort by final score, best first (stable for ties)
        scored_items.sort(key=lambda entry: entry['scores']['final_score'], reverse=True)

        # Return top-K with 1-based ranks
        results = scored_items[:top_k]
        for rank, entry in enumerate(results, 1):
            entry['rank'] = rank

        return results

    @staticmethod
    def _citation_count(metadata: Dict[str, Any]):
        """Return 'citationCount' (or 'citations') if numeric, else 0."""
        count = metadata.get('citationCount', 0) or metadata.get('citations', 0)
        return count if isinstance(count, (int, float)) else 0

    @staticmethod
    def _publication_year(metadata: Dict[str, Any], year_pattern) -> Optional[int]:
        """Return the first 4-digit run in metadata['published'], or None."""
        published = metadata.get('published', '')
        if not isinstance(published, str):
            return None
        match = year_pattern.search(published)
        return int(match.group()) if match else None
279
+
280
+ # ============================================================================
281
+ # FASTAPI APPLICATION
282
+ # ============================================================================
283
+
284
# Application object; title, description and version appear in the generated
# OpenAPI documentation.
app = FastAPI(
    title="Academic Recommendation API",
    description="LLM-Powered recommendation system for academic papers and videos",
    version="1.0.0"
)

# Enable CORS for any origin, method and header.
# Credentials stay disabled: a wildcard origin must not be combined with
# credentialed requests, and nothing in this API reads cookies or auth headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Global engine instance (loaded on startup); None until startup has finished.
engine = None
301
+
302
+
303
@app.on_event("startup")
async def startup_event():
    """Load model and corpus on startup."""
    global engine

    banner = "="*70
    print(banner)
    print("STARTING RECOMMENDATION API SERVER")
    print(banner)

    try:
        # Build the query encoder first; the engine needs it.
        query_encoder = SPECTER2Encoder(
            model_name=CONFIG['specter2_model'],
            adapter_name=CONFIG['specter2_adapter'],
            device=CONFIG['device']
        )

        # Load corpus and embeddings and publish the engine for the endpoints.
        engine = RecommendationEngine(
            corpus_path=CONFIG['corpus_path'],
            embeddings_path=CONFIG['embeddings_path'],
            encoder=query_encoder
        )

        print("\nβœ… API Server Ready!")
        print(f"Device: {CONFIG['device']}")
        print(f"Corpus: {len(engine.corpus)} items")
        print(banner)

    except Exception as e:
        # Report the failure, then let it propagate so startup aborts.
        print(f"\n❌ ERROR during startup: {str(e)}")
        raise
335
+
336
+
337
@app.get("/")
async def root():
    """Health check endpoint."""
    # Report 0 items while the engine has not been created yet.
    corpus_size = 0
    if engine:
        corpus_size = len(engine.corpus)

    return {
        "service": "Academic Recommendation API",
        "status": "running",
        "version": "1.0.0",
        "corpus_size": corpus_size,
    }
346
+
347
+
348
@app.get("/health")
async def health():
    """Detailed health check."""
    # Before startup completes there is no engine, hence "initializing" and 0.
    if engine:
        status = "healthy"
        corpus_loaded = len(engine.corpus)
    else:
        status = "initializing"
        corpus_loaded = 0

    return {
        "status": status,
        "device": CONFIG['device'],
        "model_loaded": engine is not None,
        "corpus_loaded": corpus_loaded,
    }
357
+
358
+
359
@app.post("/recommend", response_model=RecommendationResponse)
async def get_recommendations(request: RecommendationRequest):
    """
    Get paper/video recommendations for a query.

    **Parameters:**
    - query: Search query (required)
    - top_k: Number of results (1-50, default 10)
    - filter_type: Filter by 'paper' or 'video'
    - year_min: Minimum publication year
    - year_max: Maximum publication year
    - category: Filter by arXiv category
    - min_citations: Minimum citation count

    **Returns:**
    - Ranked list of recommendations with scores and metadata
    """
    # Responds 503 until the startup hook has built the engine, and 500 (with
    # the original error text) if ranking or response validation fails.
    if not engine:
        raise HTTPException(status_code=503, detail="Engine not initialized")

    import time  # function-scope import; `time` is not imported at file level

    try:
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by wall-clock adjustments.
        start_time = time.perf_counter()

        # Get recommendations
        results = engine.recommend(
            query=request.query,
            top_k=request.top_k,
            filter_type=request.filter_type,
            year_min=request.year_min,
            year_max=request.year_max,
            category=request.category,
            min_citations=request.min_citations,
        )

        # Calculate execution time in milliseconds
        execution_time = (time.perf_counter() - start_time) * 1000

        # Format response
        return RecommendationResponse(
            query=request.query,
            total_results=len(results),
            recommendations=[RecommendationItem(**item) for item in results],
            execution_time_ms=round(execution_time, 2)
        )

    except Exception as e:
        # Chain the original exception so server logs keep the real traceback.
        raise HTTPException(status_code=500, detail=f"Recommendation failed: {str(e)}") from e
411
+
412
+
413
@app.get("/stats")
async def get_stats():
    """Get corpus statistics."""
    # Responds 503 until the startup hook has built the engine.
    if not engine:
        raise HTTPException(status_code=503, detail="Engine not initialized")

    from collections import Counter  # function-scope: not imported at file level

    # Items without a 'type' count as papers, matching the default that
    # RecommendationEngine.recommend applies when filtering.
    papers = [item for item in engine.corpus if item.get('type', 'paper') == 'paper']
    videos = [item for item in engine.corpus if item.get('type') == 'video']

    # Category distribution over papers; missing, null or empty categories are
    # all grouped under 'unknown'.
    categories = Counter()
    for paper in papers:
        metadata = paper.get('metadata') or {}
        cat = metadata.get('primary_category') or metadata.get('category') or 'unknown'
        categories[cat] += 1

    # Ten most frequent categories, most common first.
    top_categories = categories.most_common(10)

    return {
        "total_items": len(engine.corpus),
        "papers": len(papers),
        "videos": len(videos),
        "top_categories": [{"category": cat, "count": count} for cat, count in top_categories],
        "corpus_metadata": engine.corpus_metadata,
    }
438
+
439
+
440
+
441
+ # ============================================================================
442
+ # MAIN
443
+ # ============================================================================
444
+
445
if __name__ == "__main__":
    # Direct execution (python api_server.py): print the local URLs, then
    # serve the app on port 8000 on all interfaces.
    for startup_line in (
        "\nπŸš€ Starting API server...",
        "πŸ“ API docs will be available at: http://localhost:8000/docs",
        "πŸ”§ Health check: http://localhost:8000/health\n",
    ):
        print(startup_line)

    server_options = dict(host="0.0.0.0", port=8000, log_level="info")
    uvicorn.run(app, **server_options)
download_data.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Download corpus and embeddings from Google Drive
3
+ """
4
+ import gdown
5
+ import os
6
+ from pathlib import Path
7
+
8
def download_data():
    """Download data files from Google Drive.

    Creates `data_final/processed/` if needed and fetches the corpus JSON and
    the embeddings array, skipping any file that already exists.

    Raises:
        RuntimeError: If a download does not produce its target file. Failing
            here stops `python download_data.py && uvicorn ...` before the
            server starts without its data.
    """
    # Create directory
    data_dir = Path('data_final/processed')
    data_dir.mkdir(parents=True, exist_ok=True)

    # (label, Google Drive file ID, destination path)
    files = (
        ('corpus', '1LmT3oEt_F4IccKKKqYk6-A7Yy6ipony5',
         data_dir / 'corpus_with_embeddings.json'),
        ('embeddings', '1XG8_PsXFBjAVRET4pud_sklM_4iPPhdi',
         data_dir / 'embeddings.npy'),
    )

    for label, file_id, destination in files:
        if destination.exists():
            continue

        print(f"⏳ Downloading {label}...")
        saved_to = gdown.download(id=file_id, output=str(destination), quiet=False)

        # NOTE(review): gdown.download appears to return None rather than
        # raise when the file cannot be retrieved — confirm for the pinned
        # version. Checking the file on disk as well covers both cases.
        if saved_to is None or not destination.exists():
            raise RuntimeError(
                f"Failed to download {label} (Google Drive id {file_id}) to {destination}"
            )

        print(f"βœ… {label.capitalize()} downloaded!")
33
+
34
if __name__ == '__main__':
    # Script entry point: fetch any missing data files, then report success.
    download_data()
    print("βœ… All data ready!")
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.104.1
2
+ uvicorn[standard]==0.24.0
3
+ pydantic==2.5.0
4
+ numpy==1.24.3
5
+ torch==2.1.0
6
+ transformers==4.35.0
7
+ adapters==0.1.0
8
+ scikit-learn==1.3.2
9
+ python-multipart==0.0.6
10
+ gdown==4.7.1
11
+ huggingface-hub==0.19.4