Spaces:

dVd5t
/

aging-theory-analyzer

Running

App Files Files Community

aging-theory-analyzer / app.py

dVd5t

Update app.py

4177824 verified 3 months ago

raw

history blame contribute delete

9.12 kB

	from fastapi import FastAPI, HTTPException
	from fastapi.middleware.cors import CORSMiddleware
	from fastapi.responses import HTMLResponse
	from pydantic import BaseModel
	from typing import List, Optional
	import logging
	from dataclasses import asdict
	import time

	from api_clients import SemanticScholarAPI
	from llm_providers import GroqExtractor

	# Configure logging once at import time; INFO so the per-request progress
	# lines emitted by the handlers below are visible.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	app = FastAPI(title="Aging Theory Analyzer API", version="1.0.0")

	# CORS: the API is meant to be called from any frontend origin, so origins,
	# methods and headers stay wildcarded. Credentials are disabled because a
	# wildcard origin must not be combined with allow_credentials=True (see the
	# FastAPI CORS documentation), and the request models carry the LLM API key
	# in the JSON body rather than in cookies.
	# NOTE(review): if a frontend really sends cookies / `credentials: "include"`,
	# list its origins explicitly here instead of re-enabling credentials with "*".
	app.add_middleware(
	    CORSMiddleware,
	    allow_origins=["*"],
	    allow_credentials=False,
	    allow_methods=["*"],
	    allow_headers=["*"],
	)

	class CollectRequest(BaseModel):
	    """Request body for ``POST /api/collect``."""

	    # Search strings; the collect handler issues one search per entry.
	    queries: List[str]
	    # Publication-year bounds forwarded to the paper search call.
	    year_from: int = 2000
	    year_to: int = 2025
	    # Overall paper budget; the handler divides it evenly between queries.
	    max_papers: int = 100

	class ExtractRequest(BaseModel):
	    """Request body for the legacy ``POST /api/extract`` endpoint."""

	    # Paper records as plain dicts; each one is handed to the extractor.
	    papers: List[dict]
	    # Provider name supplied by the client; the handler logs it.
	    llm_provider: str
	    # Optional in the schema, but the handler answers HTTP 400 when it is
	    # missing or empty.
	    llm_api_key: Optional[str] = None

	class BatchExtractRequest(BaseModel):
	    """Request body for processing a single batch of papers."""

	    # Paper records of this batch as plain dicts.
	    papers: List[dict]
	    # Provider name supplied by the client; the handler logs it.
	    llm_provider: str
	    # API key passed to the extractor; an empty value is rejected with HTTP 400.
	    llm_api_key: str
	    # Position of this batch and the overall batch count; both are echoed
	    # back in the response and used in log lines.
	    batch_number: int
	    total_batches: int

	@app.get("/", response_class=HTMLResponse)
	async def root():
	    """Serve the static HTML landing page.

	    The page lists the available endpoints, shows a quick-start guide and
	    fills in the backend URL client-side from ``window.location.origin``.

	    Returns:
	        The page markup as a string (sent as an HTML response).
	    """
	    # The footer separator is a plain "|": a backslash-escaped pipe is not a
	    # valid string escape and would show a stray backslash on the page.
	    return """
	<html>
	<head>
	<title>Aging Theory Analyzer API</title>
	<style>
	body {
	font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
	max-width: 800px;
	margin: 50px auto;
	padding: 20px;
	background: #f8f9fa;
	}
	h1 { color: #2c3e50; }
	.status { color: #27ae60; font-weight: bold; font-size: 1.2em; }
	.endpoint {
	background: white;
	padding: 15px;
	margin: 10px 0;
	border-radius: 8px;
	border-left: 4px solid #3498db;
	}
	a { color: #3498db; text-decoration: none; }
	a:hover { text-decoration: underline; }
	.features {
	background: #e8f4f8;
	padding: 15px;
	border-radius: 8px;
	margin: 20px 0;
	}
	</style>
	</head>
	<body>
	<h1>🧬 Aging Theory Analyzer API</h1>
	<p class="status">✅ Backend is running on Hugging Face Spaces!</p>

	<div class="features">
	<h3>✨ Features:</h3>
	<ul>
	<li>🔍 Paper collection from Semantic Scholar</li>
	<li>🤖 LLM-powered Q1-Q9 extraction with Groq</li>
	<li>📦 Batch processing for large datasets (10 papers/batch)</li>
	<li>⚡ Automatic rate limiting</li>
	</ul>
	</div>

	<h2>📚 Available Endpoints:</h2>
	<div class="endpoint">
	<strong>GET</strong> <a href="/docs">/docs</a> - Interactive API documentation (Swagger UI)
	</div>
	<div class="endpoint">
	<strong>GET</strong> <a href="/api/health">/api/health</a> - Health check endpoint
	</div>
	<div class="endpoint">
	<strong>POST</strong> /api/collect - Collect papers from Semantic Scholar
	</div>
	<div class="endpoint">
	<strong>POST</strong> /api/extract/batch - Extract Q1-Q9 from one batch (10 papers)
	</div>
	<div class="endpoint">
	<strong>POST</strong> /api/extract - Legacy: Extract all papers at once (not recommended for >50 papers)
	</div>

	<h2>🚀 Quick Start:</h2>
	<ol>
	<li>Copy this URL: <code id="url"></code></li>
	<li>Paste in frontend "Backend URL" field</li>
	<li>Get free Groq API key: <a href="https://console.groq.com/keys" target="_blank">console.groq.com/keys</a></li>
	<li>Add queries and start analyzing!</li>
	</ol>

	<p><small>Version 1.0.0 | Powered by FastAPI + Groq LLM (Llama 3.1 70B) + Semantic Scholar API</small></p>

	<script>
	document.getElementById('url').textContent = window.location.origin;
	</script>
	</body>
	</html>
	"""

	@app.get("/api/health")
	async def health():
	    """Report liveness, version, current time and a static feature summary."""
	    feature_summary = {
	        "paper_collection": "Semantic Scholar API",
	        "llm_extraction": "Groq (Llama 3.1 70B)",
	        "batch_processing": "10 papers per batch",
	    }

	    payload = {"status": "ok", "version": "1.0.0"}
	    # Seconds since the epoch at the moment the check is answered.
	    payload["timestamp"] = time.time()
	    payload["message"] = "Aging Theory Analyzer Backend is running"
	    payload["features"] = feature_summary
	    return payload

	@app.post("/api/collect")
	async def collect_papers(request: CollectRequest):
	    """Collect papers from Semantic Scholar for every query and deduplicate them.

	    The ``max_papers`` budget is split evenly between the queries. A query that
	    fails is logged and skipped so the remaining queries still contribute.

	    Args:
	        request: Queries, year bounds and the overall paper budget.

	    Returns:
	        A dict with ``papers`` (the unique papers converted via ``asdict``),
	        ``total`` (their count), ``api_calls`` (a figure derived from the result
	        count: one per 100 papers, plus one) and ``recommended_batch_size``.
	    """
	    logger.info(f"Collection: {len(request.queries)} queries, {request.year_from}-{request.year_to}")

	    # A non-positive budget means there is nothing to fetch at all.
	    max_papers = max(request.max_papers, 0)
	    queries = request.queries if max_papers else []

	    # Floor division yields 0 when there are more queries than the budget;
	    # ask for at least one paper per query and cap the final result instead.
	    papers_per_query = max(1, max_papers // len(queries)) if queries else max_papers

	    api = SemanticScholarAPI()
	    all_papers = []

	    for query in queries:
	        try:
	            papers = await api.search_papers(query, request.year_from, request.year_to, papers_per_query)
	            all_papers.extend(papers)
	            logger.info(f"Collected {len(papers)} for '{query}'")
	        except Exception as e:
	            # Best effort: one failing query must not abort the whole collection.
	            logger.error(f"Error: {e}")

	    # Deduplicate, preferring the paper id and falling back to the title.
	    seen = set()
	    unique_papers = []
	    for paper in all_papers:
	        key = paper.id or paper.title
	        if key not in seen:
	            seen.add(key)
	            unique_papers.append(paper)

	    # Enforce the overall budget (the per-query minimum of 1 can exceed it).
	    unique_papers = unique_papers[:max_papers]

	    logger.info(f"Collection complete: {len(unique_papers)} unique papers")

	    return {
	        "papers": [asdict(p) for p in unique_papers],
	        "total": len(unique_papers),
	        "api_calls": len(unique_papers) // 100 + 1,
	        "recommended_batch_size": 10,
	    }

	@app.post("/api/extract/batch")
	async def extract_batch(request: BatchExtractRequest):
	    """
	    Extract Q1-Q9 from ONE batch of papers (recommended: 10 papers per batch)

	    Frontend should call this endpoint multiple times for large datasets:
	    - Split papers into batches of 10
	    - Call this endpoint for each batch
	    - Update progress bar after each batch

	    Args:
	        request: The batch of papers, the LLM API key and batch bookkeeping.

	    Returns:
	        A dict echoing ``batch_number`` / ``total_batches`` plus ``results``,
	        ``processed`` and ``failed`` counts, ``failed_papers`` (id, title and
	        error text per failure) and ``theories_found`` (results whose ``q2``
	        equals ``'Yes'``).

	    Raises:
	        HTTPException: 400 when no LLM API key is supplied.
	    """
	    paper_count = len(request.papers)
	    logger.info(f"Batch {request.batch_number}/{request.total_batches}: {paper_count} papers with {request.llm_provider}")

	    if not request.llm_api_key:
	        raise HTTPException(status_code=400, detail="LLM API key required")

	    extractor = GroqExtractor(request.llm_api_key)
	    results = []
	    failed = []

	    for idx, paper in enumerate(request.papers, 1):
	        try:
	            result = await extractor.extract(paper)
	        except Exception as e:
	            # Best effort: record the failure and carry on with the next paper.
	            failed.append({
	                "paper_id": paper.get('id', 'unknown'),
	                "paper_title": paper.get('title', 'unknown'),
	                "error": str(e),
	            })
	            logger.error(f"Batch {request.batch_number}: [{idx}/{paper_count}] ✗ {str(e)[:100]}")
	            continue

	        results.append(result)
	        # Look the title up defensively: a missing or None title must not make
	        # a successfully extracted paper show up in ``failed`` as well.
	        title = str(paper.get('title') or '')[:50]
	        logger.info(f"Batch {request.batch_number}: [{idx}/{paper_count}] ✓ {title}")

	    return {
	        "batch_number": request.batch_number,
	        "total_batches": request.total_batches,
	        "results": results,
	        "processed": len(results),
	        "failed": len(failed),
	        "failed_papers": failed,
	        "theories_found": sum(1 for r in results if r.get('q2') == 'Yes'),
	    }

	@app.post("/api/extract")
	async def extract_data(request: ExtractRequest):
	    """
	    Legacy endpoint: run the extractor over every submitted paper in one call.

	    ⚠️ Not recommended for large datasets (>50 papers); prefer
	    /api/extract/batch, which processes one batch per request.

	    Returns a dict with ``results``, ``total`` (successful extractions),
	    ``theories_found`` (results whose ``q2`` equals ``'Yes'``), ``failed``
	    (number of papers that raised) and ``warning`` (a hint for large inputs,
	    otherwise ``None``). Responds with HTTP 400 when no API key is given.
	    """
	    paper_total = len(request.papers)
	    logger.info(f"Extraction: {paper_total} papers with {request.llm_provider}")

	    if not request.llm_api_key:
	        raise HTTPException(status_code=400, detail="API key required")

	    # Large inputs are still processed; they only trigger a warning.
	    oversized = paper_total > 50
	    if oversized:
	        logger.warning(f"Large dataset detected ({paper_total} papers). Consider using /api/extract/batch")

	    extractor = GroqExtractor(request.llm_api_key)
	    extracted = []
	    failures = 0

	    for position, paper in enumerate(request.papers, start=1):
	        try:
	            extracted.append(await extractor.extract(paper))
	            logger.info(f"[{position}/{paper_total}] ✓")
	        except Exception as exc:
	            # A failing paper is counted and skipped; the loop keeps going.
	            failures += 1
	            logger.error(f"[{position}] Failed: {exc}")

	    if oversized:
	        warning = "Consider using /api/extract/batch for large datasets"
	    else:
	        warning = None

	    theories = [entry for entry in extracted if entry.get('q2') == 'Yes']

	    return {
	        "results": extracted,
	        "total": len(extracted),
	        "theories_found": len(theories),
	        "failed": failures,
	        "warning": warning,
	    }