Commit Β·
7dd9eed
0
Parent(s):
feat: full-stack LitAgent MVP with Neo4j knowledge graph integration
Browse files- FastAPI backend: pipeline (arXiv + Semantic Scholar), extraction, contradiction detection, report generation, LLM Council
- Firebase Auth + Firestore for sessions, papers, reports
- Neo4j AuraDB: writes Paper/Author/Method/Dataset nodes and CITES/USES/CONTRADICTS edges per session
- Next.js frontend: dashboard, session page with stepper, tabs (papers, report, graph, contradictions, gaps)
- ReactFlow knowledge graph viewer with stable nodeTypes and memoized nodes/edges
- Graceful 404 handling on report/graph tabs with auto-fetch on session completion
- Groq (llama-3.3-70b-versatile) as default LLM provider
Made-with: Cursor
- app/__init__.py +1 -0
- app/api/__init__.py +1 -0
- app/api/research.py +100 -0
- app/core/__init__.py +1 -0
- app/core/auth.py +45 -0
- app/core/config.py +42 -0
- app/core/firebase.py +59 -0
- app/core/llm.py +87 -0
- app/core/neo4j_client.py +34 -0
- app/main.py +32 -0
- app/schemas/__init__.py +1 -0
- app/schemas/research.py +150 -0
- app/services/__init__.py +1 -0
- app/services/council/__init__.py +1 -0
- app/services/council/council_runner.py +49 -0
- app/services/extraction/__init__.py +1 -0
- app/services/extraction/contradiction_detector.py +75 -0
- app/services/extraction/extractor.py +37 -0
- app/services/graph/__init__.py +1 -0
- app/services/graph/graph_builder.py +64 -0
- app/services/graph/neo4j_writer.py +210 -0
- app/services/parsing/__init__.py +1 -0
- app/services/parsing/pdf_parser.py +85 -0
- app/services/reporting/__init__.py +1 -0
- app/services/reporting/report_generator.py +87 -0
- app/services/retrieval/__init__.py +1 -0
- app/services/retrieval/arxiv_adapter.py +30 -0
- app/services/retrieval/query_decomposer.py +34 -0
- app/services/retrieval/ranker.py +45 -0
- app/services/retrieval/semantic_scholar_adapter.py +95 -0
- app/workers/__init__.py +1 -0
- app/workers/pipeline.py +227 -0
- requirements.txt +40 -0
app/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
ο»Ώ
|
app/api/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
ο»Ώ
|
app/api/research.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks
|
| 2 |
+
from ..schemas.research import (
|
| 3 |
+
ResearchQueryRequest, ResearchQueryResponse, SessionStatusResponse,
|
| 4 |
+
)
|
| 5 |
+
from ..core.auth import get_current_user
|
| 6 |
+
from ..core.firebase import get_db
|
| 7 |
+
from ..workers.pipeline import run_research_pipeline
|
| 8 |
+
import uuid
|
| 9 |
+
from datetime import datetime, timezone
|
| 10 |
+
|
| 11 |
+
router = APIRouter(prefix="/research", tags=["research"])
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@router.post("/query", response_model=ResearchQueryResponse)
|
| 15 |
+
async def submit_query(
|
| 16 |
+
body: ResearchQueryRequest,
|
| 17 |
+
background_tasks: BackgroundTasks,
|
| 18 |
+
user: dict = Depends(get_current_user),
|
| 19 |
+
):
|
| 20 |
+
db = get_db()
|
| 21 |
+
session_id = str(uuid.uuid4())
|
| 22 |
+
now = datetime.now(timezone.utc).isoformat()
|
| 23 |
+
|
| 24 |
+
db.collection("research_sessions").document(session_id).set({
|
| 25 |
+
"userId": user["uid"],
|
| 26 |
+
"query": body.query,
|
| 27 |
+
"status": "accepted",
|
| 28 |
+
"createdAt": now,
|
| 29 |
+
"paperCount": 0,
|
| 30 |
+
})
|
| 31 |
+
|
| 32 |
+
background_tasks.add_task(run_research_pipeline, session_id, body.query, user["uid"])
|
| 33 |
+
|
| 34 |
+
return ResearchQueryResponse(session_id=session_id)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
@router.get("/{session_id}/status", response_model=SessionStatusResponse)
|
| 38 |
+
async def get_status(
|
| 39 |
+
session_id: str,
|
| 40 |
+
user: dict = Depends(get_current_user),
|
| 41 |
+
):
|
| 42 |
+
db = get_db()
|
| 43 |
+
doc = db.collection("research_sessions").document(session_id).get()
|
| 44 |
+
if not doc.exists:
|
| 45 |
+
raise HTTPException(status_code=404, detail="Session not found")
|
| 46 |
+
data = doc.to_dict()
|
| 47 |
+
if data.get("userId") != user["uid"]:
|
| 48 |
+
raise HTTPException(status_code=403, detail="Forbidden")
|
| 49 |
+
return SessionStatusResponse(
|
| 50 |
+
session_id=session_id,
|
| 51 |
+
status=data["status"],
|
| 52 |
+
paper_count=data.get("paperCount"),
|
| 53 |
+
)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
@router.get("/{session_id}/papers")
|
| 57 |
+
async def get_papers(session_id: str, user: dict = Depends(get_current_user)):
|
| 58 |
+
db = get_db()
|
| 59 |
+
docs = db.collection("papers").where("sessionId", "==", session_id).stream()
|
| 60 |
+
papers = [{"id": d.id, **d.to_dict()} for d in docs]
|
| 61 |
+
return {"papers": papers}
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
@router.get("/{session_id}/report")
|
| 65 |
+
async def get_report(session_id: str, user: dict = Depends(get_current_user)):
|
| 66 |
+
db = get_db()
|
| 67 |
+
docs = db.collection("reports").where("sessionId", "==", session_id).limit(1).stream()
|
| 68 |
+
report = next(({"id": d.id, **d.to_dict()} for d in docs), None)
|
| 69 |
+
if not report:
|
| 70 |
+
raise HTTPException(status_code=404, detail="Report not ready yet")
|
| 71 |
+
return {"report": report}
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
@router.get("/{session_id}/graph")
|
| 75 |
+
async def get_graph(session_id: str, user: dict = Depends(get_current_user)):
|
| 76 |
+
db = get_db()
|
| 77 |
+
doc = db.collection("graphs").document(session_id).get()
|
| 78 |
+
if not doc.exists:
|
| 79 |
+
raise HTTPException(status_code=404, detail="Graph not ready yet")
|
| 80 |
+
return {"graph": doc.to_dict()}
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
@router.get("/{session_id}/neo4j-graph")
|
| 84 |
+
async def get_neo4j_graph(session_id: str, user: dict = Depends(get_current_user)):
|
| 85 |
+
"""Returns the knowledge graph directly from Neo4j AuraDB."""
|
| 86 |
+
try:
|
| 87 |
+
from ..services.graph.neo4j_writer import get_graph_for_session
|
| 88 |
+
data = get_graph_for_session(session_id)
|
| 89 |
+
return {"graph": data}
|
| 90 |
+
except RuntimeError as e:
|
| 91 |
+
raise HTTPException(status_code=503, detail=str(e))
|
| 92 |
+
except Exception as e:
|
| 93 |
+
raise HTTPException(status_code=500, detail=f"Neo4j query failed: {e}")
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
@router.get("/{session_id}/contradictions")
|
| 97 |
+
async def get_contradictions(session_id: str, user: dict = Depends(get_current_user)):
|
| 98 |
+
db = get_db()
|
| 99 |
+
docs = db.collection("contradictions").where("sessionId", "==", session_id).stream()
|
| 100 |
+
return {"contradictions": [{"id": d.id, **d.to_dict()} for d in docs]}
|
app/core/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
ο»Ώ
|
app/core/auth.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import json
|
| 3 |
+
from fastapi import Depends, HTTPException, status
|
| 4 |
+
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
|
| 5 |
+
|
| 6 |
+
bearer_scheme = HTTPBearer(auto_error=False)
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def _decode_jwt_payload(token: str) -> dict:
|
| 10 |
+
"""
|
| 11 |
+
Decode Firebase JWT payload without network verification.
|
| 12 |
+
Firebase tokens are standard JWTs β we extract uid from the payload.
|
| 13 |
+
"""
|
| 14 |
+
try:
|
| 15 |
+
payload_part = token.split('.')[1]
|
| 16 |
+
# Fix base64 padding
|
| 17 |
+
payload_part += '=' * (4 - len(payload_part) % 4)
|
| 18 |
+
decoded = base64.urlsafe_b64decode(payload_part)
|
| 19 |
+
return json.loads(decoded)
|
| 20 |
+
except Exception as e:
|
| 21 |
+
raise HTTPException(
|
| 22 |
+
status_code=status.HTTP_401_UNAUTHORIZED,
|
| 23 |
+
detail=f"Could not decode token: {e}"
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def get_current_user(
|
| 28 |
+
credentials: HTTPAuthorizationCredentials = Depends(bearer_scheme),
|
| 29 |
+
) -> dict:
|
| 30 |
+
if not credentials or not credentials.credentials:
|
| 31 |
+
raise HTTPException(
|
| 32 |
+
status_code=status.HTTP_401_UNAUTHORIZED,
|
| 33 |
+
detail="Not authenticated β please log in"
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
payload = _decode_jwt_payload(credentials.credentials)
|
| 37 |
+
|
| 38 |
+
uid = payload.get("user_id") or payload.get("sub")
|
| 39 |
+
if not uid:
|
| 40 |
+
raise HTTPException(
|
| 41 |
+
status_code=status.HTTP_401_UNAUTHORIZED,
|
| 42 |
+
detail="Token missing user ID"
|
| 43 |
+
)
|
| 44 |
+
|
| 45 |
+
return {"uid": uid, "email": payload.get("email", "")}
|
app/core/config.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic_settings import BaseSettings
|
| 2 |
+
from functools import lru_cache
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class Settings(BaseSettings):
|
| 6 |
+
# Neo4j AuraDB
|
| 7 |
+
neo4j_uri: str = ""
|
| 8 |
+
neo4j_username: str = ""
|
| 9 |
+
neo4j_password: str = ""
|
| 10 |
+
|
| 11 |
+
# Firebase
|
| 12 |
+
firebase_credentials_path: str = "serviceAccountKey.json"
|
| 13 |
+
firebase_project_id: str = ""
|
| 14 |
+
|
| 15 |
+
# LLM
|
| 16 |
+
groq_api_key: str = ""
|
| 17 |
+
openai_api_key: str = ""
|
| 18 |
+
anthropic_api_key: str = ""
|
| 19 |
+
gemini_api_key: str = ""
|
| 20 |
+
default_llm: str = "groq" # groq | openai | anthropic | gemini
|
| 21 |
+
|
| 22 |
+
# Academic APIs
|
| 23 |
+
semantic_scholar_api_key: str = ""
|
| 24 |
+
pubmed_email: str = ""
|
| 25 |
+
|
| 26 |
+
# Celery / Redis
|
| 27 |
+
redis_url: str = "redis://localhost:6379/0"
|
| 28 |
+
celery_broker_url: str = "redis://localhost:6379/0"
|
| 29 |
+
celery_result_backend: str = "redis://localhost:6379/1"
|
| 30 |
+
|
| 31 |
+
# Pipeline limits
|
| 32 |
+
max_papers_per_session: int = 50
|
| 33 |
+
max_citation_depth: int = 1
|
| 34 |
+
max_expansion_per_paper: int = 10
|
| 35 |
+
|
| 36 |
+
class Config:
|
| 37 |
+
env_file = ".env"
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
@lru_cache
|
| 41 |
+
def get_settings() -> Settings:
|
| 42 |
+
return Settings()
|
app/core/firebase.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
# Fix gRPC DNS resolution on Windows before any grpc import
|
| 4 |
+
os.environ.setdefault("GRPC_DNS_RESOLVER", "native")
|
| 5 |
+
|
| 6 |
+
import firebase_admin
|
| 7 |
+
from firebase_admin import credentials, firestore, auth
|
| 8 |
+
from .config import get_settings
|
| 9 |
+
|
| 10 |
+
_initialized = False
|
| 11 |
+
_db = None
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def init_firebase():
|
| 15 |
+
global _initialized
|
| 16 |
+
if _initialized or firebase_admin._apps:
|
| 17 |
+
_initialized = True
|
| 18 |
+
return
|
| 19 |
+
|
| 20 |
+
settings = get_settings()
|
| 21 |
+
cred_path = settings.firebase_credentials_path
|
| 22 |
+
|
| 23 |
+
if not os.path.exists(cred_path):
|
| 24 |
+
print(
|
| 25 |
+
f"[Firebase] WARNING: '{cred_path}' not found. "
|
| 26 |
+
"Download from Firebase Console > Project Settings > Service Accounts."
|
| 27 |
+
)
|
| 28 |
+
return
|
| 29 |
+
|
| 30 |
+
cred = credentials.Certificate(cred_path)
|
| 31 |
+
firebase_admin.initialize_app(cred)
|
| 32 |
+
_initialized = True
|
| 33 |
+
print("[Firebase] Initialised successfully.")
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def get_db():
|
| 37 |
+
global _db
|
| 38 |
+
if _db is not None:
|
| 39 |
+
return _db
|
| 40 |
+
|
| 41 |
+
if not firebase_admin._apps:
|
| 42 |
+
init_firebase()
|
| 43 |
+
if not firebase_admin._apps:
|
| 44 |
+
raise RuntimeError(
|
| 45 |
+
"Firebase is not initialised. "
|
| 46 |
+
"Place serviceAccountKey.json in the backend/ folder and restart."
|
| 47 |
+
)
|
| 48 |
+
|
| 49 |
+
_db = firestore.client()
|
| 50 |
+
print("[Firebase] Firestore client ready.")
|
| 51 |
+
return _db
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def verify_token(token: str) -> dict:
|
| 55 |
+
if not firebase_admin._apps:
|
| 56 |
+
init_firebase()
|
| 57 |
+
if not firebase_admin._apps:
|
| 58 |
+
raise RuntimeError("Firebase is not initialised.")
|
| 59 |
+
return auth.verify_id_token(token)
|
app/core/llm.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Centralised LLM client.
|
| 3 |
+
Supports: groq | openai | anthropic
|
| 4 |
+
All services should call `chat_completion()` instead of importing SDK clients directly.
|
| 5 |
+
"""
|
| 6 |
+
import json
|
| 7 |
+
from typing import Any
|
| 8 |
+
from .config import get_settings
|
| 9 |
+
|
| 10 |
+
# Groq model to use β fast and capable for structured extraction tasks
|
| 11 |
+
GROQ_MODEL = "llama-3.3-70b-versatile"
|
| 12 |
+
OPENAI_MODEL = "gpt-4o-mini"
|
| 13 |
+
CLAUDE_MODEL = "claude-3-haiku-20240307"
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
async def chat_completion(
|
| 17 |
+
system: str,
|
| 18 |
+
user: str,
|
| 19 |
+
json_mode: bool = False,
|
| 20 |
+
temperature: float = 0,
|
| 21 |
+
) -> str:
|
| 22 |
+
"""
|
| 23 |
+
Single entry point for all LLM calls.
|
| 24 |
+
Returns the raw string content of the assistant reply.
|
| 25 |
+
"""
|
| 26 |
+
settings = get_settings()
|
| 27 |
+
provider = settings.default_llm
|
| 28 |
+
|
| 29 |
+
if provider == "groq":
|
| 30 |
+
return await _groq(system, user, json_mode, temperature, settings)
|
| 31 |
+
if provider == "openai":
|
| 32 |
+
return await _openai(system, user, json_mode, temperature, settings)
|
| 33 |
+
if provider == "anthropic":
|
| 34 |
+
return await _anthropic(system, user, temperature, settings)
|
| 35 |
+
|
| 36 |
+
raise ValueError(f"Unknown LLM provider: {provider}")
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
async def _groq(system: str, user: str, json_mode: bool, temperature: float, settings) -> str:
|
| 40 |
+
from groq import AsyncGroq
|
| 41 |
+
client = AsyncGroq(api_key=settings.groq_api_key)
|
| 42 |
+
|
| 43 |
+
kwargs: dict[str, Any] = {
|
| 44 |
+
"model": GROQ_MODEL,
|
| 45 |
+
"messages": [{"role": "system", "content": system}, {"role": "user", "content": user}],
|
| 46 |
+
"temperature": temperature,
|
| 47 |
+
}
|
| 48 |
+
if json_mode:
|
| 49 |
+
kwargs["response_format"] = {"type": "json_object"}
|
| 50 |
+
|
| 51 |
+
resp = await client.chat.completions.create(**kwargs)
|
| 52 |
+
return resp.choices[0].message.content or ""
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
async def _openai(system: str, user: str, json_mode: bool, temperature: float, settings) -> str:
|
| 56 |
+
from openai import AsyncOpenAI
|
| 57 |
+
client = AsyncOpenAI(api_key=settings.openai_api_key)
|
| 58 |
+
|
| 59 |
+
kwargs: dict[str, Any] = {
|
| 60 |
+
"model": OPENAI_MODEL,
|
| 61 |
+
"messages": [{"role": "system", "content": system}, {"role": "user", "content": user}],
|
| 62 |
+
"temperature": temperature,
|
| 63 |
+
}
|
| 64 |
+
if json_mode:
|
| 65 |
+
kwargs["response_format"] = {"type": "json_object"}
|
| 66 |
+
|
| 67 |
+
resp = await client.chat.completions.create(**kwargs)
|
| 68 |
+
return resp.choices[0].message.content or ""
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
async def _anthropic(system: str, user: str, temperature: float, settings) -> str:
|
| 72 |
+
import anthropic
|
| 73 |
+
client = anthropic.AsyncAnthropic(api_key=settings.anthropic_api_key)
|
| 74 |
+
msg = await client.messages.create(
|
| 75 |
+
model=CLAUDE_MODEL,
|
| 76 |
+
max_tokens=1024,
|
| 77 |
+
system=system,
|
| 78 |
+
messages=[{"role": "user", "content": user}],
|
| 79 |
+
temperature=temperature,
|
| 80 |
+
)
|
| 81 |
+
return msg.content[0].text
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
async def chat_json(system: str, user: str) -> dict:
|
| 85 |
+
"""Convenience wrapper β returns parsed JSON dict."""
|
| 86 |
+
raw = await chat_completion(system, user, json_mode=True, temperature=0)
|
| 87 |
+
return json.loads(raw)
|
app/core/neo4j_client.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Neo4j AuraDB client β singleton driver with helper methods.
|
| 3 |
+
"""
|
| 4 |
+
from neo4j import GraphDatabase, Driver
|
| 5 |
+
from .config import get_settings
|
| 6 |
+
|
| 7 |
+
_driver: Driver | None = None
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def get_driver() -> Driver:
|
| 11 |
+
global _driver
|
| 12 |
+
if _driver is not None:
|
| 13 |
+
return _driver
|
| 14 |
+
|
| 15 |
+
settings = get_settings()
|
| 16 |
+
if not settings.neo4j_uri or not settings.neo4j_password:
|
| 17 |
+
raise RuntimeError(
|
| 18 |
+
"Neo4j not configured. Set NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD in .env"
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
_driver = GraphDatabase.driver(
|
| 22 |
+
settings.neo4j_uri,
|
| 23 |
+
auth=(settings.neo4j_username, settings.neo4j_password),
|
| 24 |
+
)
|
| 25 |
+
_driver.verify_connectivity()
|
| 26 |
+
print("[Neo4j] Connected to AuraDB successfully.")
|
| 27 |
+
return _driver
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def close_driver():
|
| 31 |
+
global _driver
|
| 32 |
+
if _driver:
|
| 33 |
+
_driver.close()
|
| 34 |
+
_driver = None
|
app/main.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
# Must be set before any grpc/firebase import to fix Windows DNS resolution
|
| 3 |
+
os.environ["GRPC_DNS_RESOLVER"] = "native"
|
| 4 |
+
|
| 5 |
+
from fastapi import FastAPI
|
| 6 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 7 |
+
from .core.firebase import init_firebase
|
| 8 |
+
from .api import research
|
| 9 |
+
|
| 10 |
+
app = FastAPI(
|
| 11 |
+
title="LitAgent API",
|
| 12 |
+
description="Autonomous Research Literature Agent backend",
|
| 13 |
+
version="1.0.0",
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
app.add_middleware(
|
| 17 |
+
CORSMiddleware,
|
| 18 |
+
allow_origins=["http://localhost:3000", "https://your-production-domain.com"],
|
| 19 |
+
allow_credentials=True,
|
| 20 |
+
allow_methods=["*"],
|
| 21 |
+
allow_headers=["*"],
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
app.include_router(research.router)
|
| 25 |
+
|
| 26 |
+
@app.on_event("startup")
|
| 27 |
+
async def startup():
|
| 28 |
+
init_firebase()
|
| 29 |
+
|
| 30 |
+
@app.get("/health")
|
| 31 |
+
async def health():
|
| 32 |
+
return {"status": "ok", "service": "litagent-api"}
|
app/schemas/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
ο»Ώ
|
app/schemas/research.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, Field
|
| 2 |
+
from typing import Optional, List
|
| 3 |
+
from enum import Enum
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class SessionStatus(str, Enum):
|
| 7 |
+
accepted = "accepted"
|
| 8 |
+
retrieving = "retrieving"
|
| 9 |
+
ranking = "ranking"
|
| 10 |
+
expanding_citations = "expanding_citations"
|
| 11 |
+
parsing_pdfs = "parsing_pdfs"
|
| 12 |
+
extracting = "extracting"
|
| 13 |
+
building_graph = "building_graph"
|
| 14 |
+
generating_report = "generating_report"
|
| 15 |
+
council_review = "council_review"
|
| 16 |
+
completed = "completed"
|
| 17 |
+
failed = "failed"
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class ResearchQueryRequest(BaseModel):
|
| 21 |
+
query: str = Field(..., min_length=10, max_length=1000)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class ResearchQueryResponse(BaseModel):
|
| 25 |
+
session_id: str
|
| 26 |
+
status: SessionStatus = SessionStatus.accepted
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class SessionStatusResponse(BaseModel):
|
| 30 |
+
session_id: str
|
| 31 |
+
status: SessionStatus
|
| 32 |
+
step: Optional[str] = None
|
| 33 |
+
paper_count: Optional[int] = None
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class PaperSource(str, Enum):
|
| 37 |
+
arxiv = "arxiv"
|
| 38 |
+
pubmed = "pubmed"
|
| 39 |
+
semantic_scholar = "semantic_scholar"
|
| 40 |
+
openalex = "openalex"
|
| 41 |
+
crossref = "crossref"
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class PaperExtraction(BaseModel):
|
| 45 |
+
objective: str = ""
|
| 46 |
+
methodology: str = ""
|
| 47 |
+
datasets: List[str] = []
|
| 48 |
+
metrics: List[str] = []
|
| 49 |
+
key_findings: List[str] = []
|
| 50 |
+
limitations: List[str] = []
|
| 51 |
+
future_work: List[str] = []
|
| 52 |
+
summary: str = ""
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
class Paper(BaseModel):
|
| 56 |
+
id: str
|
| 57 |
+
session_id: str
|
| 58 |
+
external_source: PaperSource
|
| 59 |
+
source_paper_id: str
|
| 60 |
+
title: str
|
| 61 |
+
authors: List[str] = []
|
| 62 |
+
year: Optional[int] = None
|
| 63 |
+
abstract: str = ""
|
| 64 |
+
doi: Optional[str] = None
|
| 65 |
+
venue: Optional[str] = None
|
| 66 |
+
citation_count: Optional[int] = None
|
| 67 |
+
pdf_url: Optional[str] = None
|
| 68 |
+
relevance_score: Optional[float] = None
|
| 69 |
+
extraction: Optional[PaperExtraction] = None
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class ClaimType(str, Enum):
|
| 73 |
+
performance = "performance"
|
| 74 |
+
methodological = "methodological"
|
| 75 |
+
theoretical = "theoretical"
|
| 76 |
+
empirical = "empirical"
|
| 77 |
+
limitation = "limitation"
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
class Claim(BaseModel):
|
| 81 |
+
id: str
|
| 82 |
+
paper_id: str
|
| 83 |
+
paper_title: str
|
| 84 |
+
claim_text: str
|
| 85 |
+
claim_type: ClaimType
|
| 86 |
+
evidence_span: str = ""
|
| 87 |
+
confidence: float = 0.0
|
| 88 |
+
entities: List[str] = []
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
class ContradictionSeverity(str, Enum):
|
| 92 |
+
direct = "direct"
|
| 93 |
+
partial = "partial"
|
| 94 |
+
contextual = "contextual"
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
class Contradiction(BaseModel):
|
| 98 |
+
id: str
|
| 99 |
+
session_id: str
|
| 100 |
+
claim_a: Claim
|
| 101 |
+
claim_b: Claim
|
| 102 |
+
severity: ContradictionSeverity
|
| 103 |
+
explanation: str
|
| 104 |
+
reason: Optional[str] = None
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
class ResearchGap(BaseModel):
|
| 108 |
+
id: str
|
| 109 |
+
session_id: str
|
| 110 |
+
title: str
|
| 111 |
+
description: str
|
| 112 |
+
category: str
|
| 113 |
+
recurrence_count: int = 0
|
| 114 |
+
importance_score: float = 0.0
|
| 115 |
+
source_paper_ids: List[str] = []
|
| 116 |
+
derivation_note: str = ""
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
class GraphNode(BaseModel):
|
| 120 |
+
id: str
|
| 121 |
+
label: str
|
| 122 |
+
type: str
|
| 123 |
+
data: dict = {}
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
class GraphEdge(BaseModel):
|
| 127 |
+
id: str
|
| 128 |
+
source: str
|
| 129 |
+
target: str
|
| 130 |
+
type: str
|
| 131 |
+
label: Optional[str] = None
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
class KnowledgeGraph(BaseModel):
|
| 135 |
+
nodes: List[GraphNode] = []
|
| 136 |
+
edges: List[GraphEdge] = []
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
class Report(BaseModel):
|
| 140 |
+
id: str
|
| 141 |
+
session_id: str
|
| 142 |
+
report_markdown: str
|
| 143 |
+
executive_summary: str = ""
|
| 144 |
+
trend_overview: str = ""
|
| 145 |
+
methodology_comparison: str = ""
|
| 146 |
+
contradiction_summary: str = ""
|
| 147 |
+
research_gaps: List[ResearchGap] = []
|
| 148 |
+
suggested_directions: List[str] = []
|
| 149 |
+
confidence_score: float = 0.0
|
| 150 |
+
created_at: str
|
app/services/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
ο»Ώ
|
app/services/council/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
ο»Ώ
|
app/services/council/council_runner.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
LLM Council: five role-based agents validate and enrich the synthesis.
|
| 3 |
+
Roles: Extractor, Skeptic, Synthesizer, Contradiction Judge, Gap Prioritizer.
|
| 4 |
+
"""
|
| 5 |
+
from typing import Any
|
| 6 |
+
from ...core.llm import chat_completion
|
| 7 |
+
|
| 8 |
+
ROLES = {
|
| 9 |
+
"extractor": (
|
| 10 |
+
"You are the Extractor. Confirm what the papers claim: their methods, "
|
| 11 |
+
"results, and limitations. Be factual and concise."
|
| 12 |
+
),
|
| 13 |
+
"skeptic": (
|
| 14 |
+
"You are the Skeptic. Identify weaknesses, unsupported conclusions, "
|
| 15 |
+
"potential confounders, and overgeneralizations in the paper claims."
|
| 16 |
+
),
|
| 17 |
+
"synthesizer": (
|
| 18 |
+
"You are the Synthesizer. Identify cross-paper patterns, shared trends, "
|
| 19 |
+
"and broader field-level insights from the provided summaries."
|
| 20 |
+
),
|
| 21 |
+
"contradiction_judge": (
|
| 22 |
+
"You are the Contradiction Judge. Assess whether two claims genuinely "
|
| 23 |
+
"conflict or merely differ in scope/setup."
|
| 24 |
+
),
|
| 25 |
+
"gap_prioritizer": (
|
| 26 |
+
"You are the Gap Prioritizer. Rank the identified research gaps by "
|
| 27 |
+
"importance, feasibility, and potential impact."
|
| 28 |
+
),
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
async def run_role(role: str, content: str) -> str:
|
| 33 |
+
system = ROLES.get(role, "You are a helpful research assistant.")
|
| 34 |
+
try:
|
| 35 |
+
return await chat_completion(system, content[:2000], temperature=0.2)
|
| 36 |
+
except Exception as e:
|
| 37 |
+
return f"[Council role '{role}' unavailable: {e}]"
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
async def council_validate_report(report_markdown: str, papers_summary: str) -> dict[str, Any]:
|
| 41 |
+
extractor_out = await run_role("extractor", papers_summary)
|
| 42 |
+
skeptic_out = await run_role("skeptic", papers_summary)
|
| 43 |
+
synthesizer_out = await run_role("synthesizer", papers_summary)
|
| 44 |
+
|
| 45 |
+
return {
|
| 46 |
+
"extractor_notes": extractor_out,
|
| 47 |
+
"skeptic_notes": skeptic_out,
|
| 48 |
+
"synthesizer_notes": synthesizer_out,
|
| 49 |
+
}
|
app/services/extraction/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
ο»Ώ
|
app/services/extraction/contradiction_detector.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Detects contradictions between extracted claims using embeddings + LLM judgment.
|
| 3 |
+
"""
|
| 4 |
+
import uuid
|
| 5 |
+
from typing import List
|
| 6 |
+
from sentence_transformers import SentenceTransformer, util
|
| 7 |
+
from ...schemas.research import Claim, Contradiction, ContradictionSeverity
|
| 8 |
+
from ...core.llm import chat_json
|
| 9 |
+
|
| 10 |
+
_model = None
|
| 11 |
+
|
| 12 |
+
JUDGE_PROMPT = """You are a scientific contradiction detector.
|
| 13 |
+
Given two claims from different papers, determine if they contradict each other.
|
| 14 |
+
Return JSON with keys:
|
| 15 |
+
- verdict: "contradicts" | "partially_contradicts" | "supports" | "inconclusive"
|
| 16 |
+
- explanation: string (1-2 sentences)
|
| 17 |
+
- reason: string (brief likely cause of conflict, if any)"""
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _get_model():
|
| 21 |
+
global _model
|
| 22 |
+
if _model is None:
|
| 23 |
+
_model = SentenceTransformer("all-MiniLM-L6-v2")
|
| 24 |
+
return _model
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
async def detect_contradictions(claims: List[Claim]) -> List[Contradiction]:
|
| 28 |
+
if len(claims) < 2:
|
| 29 |
+
return []
|
| 30 |
+
|
| 31 |
+
model = _get_model()
|
| 32 |
+
texts = [c.claim_text for c in claims]
|
| 33 |
+
embs = model.encode(texts, convert_to_tensor=True)
|
| 34 |
+
sims = util.cos_sim(embs, embs)
|
| 35 |
+
|
| 36 |
+
candidates: list[tuple[Claim, Claim]] = []
|
| 37 |
+
for i in range(len(claims)):
|
| 38 |
+
for j in range(i + 1, len(claims)):
|
| 39 |
+
if claims[i].paper_id == claims[j].paper_id:
|
| 40 |
+
continue
|
| 41 |
+
if float(sims[i][j]) > 0.55:
|
| 42 |
+
candidates.append((claims[i], claims[j]))
|
| 43 |
+
|
| 44 |
+
contradictions: List[Contradiction] = []
|
| 45 |
+
|
| 46 |
+
for ca, cb in candidates[:20]:
|
| 47 |
+
verdict_data = await _judge_pair(ca, cb)
|
| 48 |
+
if verdict_data.get("verdict") in ("contradicts", "partially_contradicts"):
|
| 49 |
+
severity = (
|
| 50 |
+
ContradictionSeverity.direct
|
| 51 |
+
if verdict_data["verdict"] == "contradicts"
|
| 52 |
+
else ContradictionSeverity.partial
|
| 53 |
+
)
|
| 54 |
+
contradictions.append(Contradiction(
|
| 55 |
+
id=str(uuid.uuid4()),
|
| 56 |
+
session_id="",
|
| 57 |
+
claim_a=ca,
|
| 58 |
+
claim_b=cb,
|
| 59 |
+
severity=severity,
|
| 60 |
+
explanation=verdict_data.get("explanation", ""),
|
| 61 |
+
reason=verdict_data.get("reason"),
|
| 62 |
+
))
|
| 63 |
+
|
| 64 |
+
return contradictions
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
async def _judge_pair(ca: Claim, cb: Claim) -> dict:
|
| 68 |
+
text = (
|
| 69 |
+
f"Claim A (from: {ca.paper_title}):\n{ca.claim_text}\n\n"
|
| 70 |
+
f"Claim B (from: {cb.paper_title}):\n{cb.claim_text}"
|
| 71 |
+
)
|
| 72 |
+
try:
|
| 73 |
+
return await chat_json(JUDGE_PROMPT, text)
|
| 74 |
+
except Exception:
|
| 75 |
+
return {"verdict": "inconclusive", "explanation": "", "reason": ""}
|
app/services/extraction/extractor.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Structured information extraction from paper text using LLM + Pydantic validation.
|
| 3 |
+
"""
|
| 4 |
+
from ...schemas.research import PaperExtraction
|
| 5 |
+
from ...core.llm import chat_json
|
| 6 |
+
|
| 7 |
+
SYSTEM_PROMPT = """You are a scientific paper analyst. Given the text of an academic paper,
|
| 8 |
+
extract structured information and return it as valid JSON with these exact keys:
|
| 9 |
+
- objective: string (the main research goal)
|
| 10 |
+
- methodology: string (the approach/method used)
|
| 11 |
+
- datasets: list of strings (dataset names used)
|
| 12 |
+
- metrics: list of strings (evaluation metrics)
|
| 13 |
+
- key_findings: list of strings (3-5 main results/conclusions)
|
| 14 |
+
- limitations: list of strings (stated or implied limitations)
|
| 15 |
+
- future_work: list of strings (suggested future directions)
|
| 16 |
+
- summary: string (2-3 sentence plain English summary)
|
| 17 |
+
Be concise and factual. Do not hallucinate."""
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
async def extract_paper(title: str, abstract: str, sections: dict[str, str]) -> PaperExtraction:
|
| 21 |
+
text_parts = [f"Title: {title}", f"Abstract: {abstract}"]
|
| 22 |
+
for key in ["introduction", "methodology", "methods", "results", "conclusion", "limitations"]:
|
| 23 |
+
for section_name, content in sections.items():
|
| 24 |
+
if key in section_name.lower() and content:
|
| 25 |
+
text_parts.append(f"\n## {section_name.title()}\n{content[:1500]}")
|
| 26 |
+
break
|
| 27 |
+
|
| 28 |
+
text = "\n".join(text_parts)[:4000]
|
| 29 |
+
|
| 30 |
+
try:
|
| 31 |
+
data = await chat_json(SYSTEM_PROMPT, text)
|
| 32 |
+
return PaperExtraction(**{k: data.get(k, v) for k, v in PaperExtraction().model_dump().items()})
|
| 33 |
+
except Exception:
|
| 34 |
+
return PaperExtraction(
|
| 35 |
+
objective=abstract[:200],
|
| 36 |
+
summary=abstract[:300],
|
| 37 |
+
)
|
app/services/graph/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
ο»Ώ
|
app/services/graph/graph_builder.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Builds a knowledge graph from papers using networkx, serializes to JSON for frontend.
|
| 3 |
+
"""
|
| 4 |
+
import uuid
|
| 5 |
+
import networkx as nx
|
| 6 |
+
from typing import List
|
| 7 |
+
from ...schemas.research import Paper, KnowledgeGraph, GraphNode, GraphEdge
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def build_graph(papers: List[Paper]) -> KnowledgeGraph:
|
| 11 |
+
G = nx.DiGraph()
|
| 12 |
+
nodes: List[GraphNode] = []
|
| 13 |
+
edges: List[GraphEdge] = []
|
| 14 |
+
seen_methods: dict[str, str] = {}
|
| 15 |
+
seen_datasets: dict[str, str] = {}
|
| 16 |
+
seen_topics: dict[str, str] = {}
|
| 17 |
+
|
| 18 |
+
for paper in papers:
|
| 19 |
+
# Paper node
|
| 20 |
+
G.add_node(paper.id, type="Paper", label=paper.title[:50])
|
| 21 |
+
nodes.append(GraphNode(id=paper.id, label=paper.title[:50], type="Paper"))
|
| 22 |
+
|
| 23 |
+
if not paper.extraction:
|
| 24 |
+
continue
|
| 25 |
+
|
| 26 |
+
# Method nodes
|
| 27 |
+
if paper.extraction.methodology:
|
| 28 |
+
method_key = paper.extraction.methodology[:40].lower()
|
| 29 |
+
if method_key not in seen_methods:
|
| 30 |
+
mid = str(uuid.uuid4())
|
| 31 |
+
seen_methods[method_key] = mid
|
| 32 |
+
G.add_node(mid, type="Method", label=paper.extraction.methodology[:40])
|
| 33 |
+
nodes.append(GraphNode(id=mid, label=paper.extraction.methodology[:40], type="Method"))
|
| 34 |
+
edges.append(GraphEdge(
|
| 35 |
+
id=str(uuid.uuid4()), source=paper.id,
|
| 36 |
+
target=seen_methods[method_key], type="USES",
|
| 37 |
+
))
|
| 38 |
+
|
| 39 |
+
# Dataset nodes
|
| 40 |
+
for ds in (paper.extraction.datasets or [])[:3]:
|
| 41 |
+
dk = ds.lower()
|
| 42 |
+
if dk not in seen_datasets:
|
| 43 |
+
did = str(uuid.uuid4())
|
| 44 |
+
seen_datasets[dk] = did
|
| 45 |
+
G.add_node(did, type="Dataset", label=ds)
|
| 46 |
+
nodes.append(GraphNode(id=did, label=ds, type="Dataset"))
|
| 47 |
+
edges.append(GraphEdge(
|
| 48 |
+
id=str(uuid.uuid4()), source=paper.id,
|
| 49 |
+
target=seen_datasets[dk], type="TESTED_ON",
|
| 50 |
+
))
|
| 51 |
+
|
| 52 |
+
# Author node (first author)
|
| 53 |
+
if paper.authors:
|
| 54 |
+
author = paper.authors[0]
|
| 55 |
+
aid = f"author_{author.replace(' ', '_').lower()}"
|
| 56 |
+
if not G.has_node(aid):
|
| 57 |
+
G.add_node(aid, type="Author", label=author)
|
| 58 |
+
nodes.append(GraphNode(id=aid, label=author, type="Author"))
|
| 59 |
+
edges.append(GraphEdge(
|
| 60 |
+
id=str(uuid.uuid4()), source=paper.id,
|
| 61 |
+
target=aid, type="WRITTEN_BY",
|
| 62 |
+
))
|
| 63 |
+
|
| 64 |
+
return KnowledgeGraph(nodes=nodes, edges=edges)
|
app/services/graph/neo4j_writer.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Writes the knowledge graph to Neo4j AuraDB.
|
| 3 |
+
|
| 4 |
+
Node labels: Paper | Author | Method | Dataset | Claim | Topic
|
| 5 |
+
Relationships: CITES | USES | TESTED_ON | SUPPORTS | CONTRADICTS | WRITTEN_BY | RELATED_TO
|
| 6 |
+
"""
|
| 7 |
+
from typing import List
|
| 8 |
+
from ...schemas.research import Paper, Contradiction
|
| 9 |
+
from ...core.neo4j_client import get_driver
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# ββ Schema constraints (run once) ββββββββββββββββββββββββββββββββββββββββββββ
|
| 13 |
+
|
| 14 |
+
CONSTRAINTS = [
|
| 15 |
+
"CREATE CONSTRAINT paper_id IF NOT EXISTS FOR (p:Paper) REQUIRE p.id IS UNIQUE",
|
| 16 |
+
"CREATE CONSTRAINT author_name IF NOT EXISTS FOR (a:Author) REQUIRE a.name IS UNIQUE",
|
| 17 |
+
"CREATE CONSTRAINT method_name IF NOT EXISTS FOR (m:Method) REQUIRE m.name IS UNIQUE",
|
| 18 |
+
"CREATE CONSTRAINT dataset_name IF NOT EXISTS FOR (d:Dataset) REQUIRE d.name IS UNIQUE",
|
| 19 |
+
]
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def ensure_constraints():
|
| 23 |
+
driver = get_driver()
|
| 24 |
+
with driver.session() as session:
|
| 25 |
+
for constraint in CONSTRAINTS:
|
| 26 |
+
try:
|
| 27 |
+
session.run(constraint)
|
| 28 |
+
except Exception:
|
| 29 |
+
pass # Already exists
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# ββ Write papers βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 33 |
+
|
| 34 |
+
def write_papers(papers: List[Paper], session_id: str):
|
| 35 |
+
driver = get_driver()
|
| 36 |
+
|
| 37 |
+
with driver.session() as session:
|
| 38 |
+
for paper in papers:
|
| 39 |
+
# Create Paper node
|
| 40 |
+
session.run(
|
| 41 |
+
"""
|
| 42 |
+
MERGE (p:Paper {id: $id})
|
| 43 |
+
SET p.title = $title,
|
| 44 |
+
p.year = $year,
|
| 45 |
+
p.abstract = $abstract,
|
| 46 |
+
p.venue = $venue,
|
| 47 |
+
p.citationCount= $citation_count,
|
| 48 |
+
p.sessionId = $session_id,
|
| 49 |
+
p.source = $source
|
| 50 |
+
""",
|
| 51 |
+
id = paper.id,
|
| 52 |
+
title = paper.title,
|
| 53 |
+
year = paper.year,
|
| 54 |
+
abstract = (paper.abstract or "")[:500],
|
| 55 |
+
venue = paper.venue or "",
|
| 56 |
+
citation_count= paper.citation_count or 0,
|
| 57 |
+
session_id = session_id,
|
| 58 |
+
source = paper.external_source,
|
| 59 |
+
)
|
| 60 |
+
|
| 61 |
+
# Author nodes + WRITTEN_BY edges
|
| 62 |
+
for author_name in (paper.authors or [])[:3]:
|
| 63 |
+
if not author_name:
|
| 64 |
+
continue
|
| 65 |
+
session.run(
|
| 66 |
+
"""
|
| 67 |
+
MERGE (a:Author {name: $name})
|
| 68 |
+
WITH a
|
| 69 |
+
MATCH (p:Paper {id: $paper_id})
|
| 70 |
+
MERGE (p)-[:WRITTEN_BY]->(a)
|
| 71 |
+
""",
|
| 72 |
+
name=author_name, paper_id=paper.id,
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
if not paper.extraction:
|
| 76 |
+
continue
|
| 77 |
+
|
| 78 |
+
# Method node + USES edge
|
| 79 |
+
if paper.extraction.methodology:
|
| 80 |
+
method_name = paper.extraction.methodology[:80]
|
| 81 |
+
session.run(
|
| 82 |
+
"""
|
| 83 |
+
MERGE (m:Method {name: $name})
|
| 84 |
+
WITH m
|
| 85 |
+
MATCH (p:Paper {id: $paper_id})
|
| 86 |
+
MERGE (p)-[:USES]->(m)
|
| 87 |
+
""",
|
| 88 |
+
name=method_name, paper_id=paper.id,
|
| 89 |
+
)
|
| 90 |
+
|
| 91 |
+
# Dataset nodes + TESTED_ON edges
|
| 92 |
+
for ds in (paper.extraction.datasets or [])[:3]:
|
| 93 |
+
if not ds:
|
| 94 |
+
continue
|
| 95 |
+
session.run(
|
| 96 |
+
"""
|
| 97 |
+
MERGE (d:Dataset {name: $name})
|
| 98 |
+
WITH d
|
| 99 |
+
MATCH (p:Paper {id: $paper_id})
|
| 100 |
+
MERGE (p)-[:TESTED_ON]->(d)
|
| 101 |
+
""",
|
| 102 |
+
name=ds, paper_id=paper.id,
|
| 103 |
+
)
|
| 104 |
+
|
| 105 |
+
print(f"[Neo4j] Wrote {len(papers)} papers to AuraDB.")
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
# ββ Write contradictions βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 109 |
+
|
| 110 |
+
def write_contradictions(contradictions: List[Contradiction]):
|
| 111 |
+
if not contradictions:
|
| 112 |
+
return
|
| 113 |
+
driver = get_driver()
|
| 114 |
+
|
| 115 |
+
with driver.session() as session:
|
| 116 |
+
for c in contradictions:
|
| 117 |
+
session.run(
|
| 118 |
+
"""
|
| 119 |
+
MATCH (a:Paper {id: $paper_a})
|
| 120 |
+
MATCH (b:Paper {id: $paper_b})
|
| 121 |
+
MERGE (a)-[r:CONTRADICTS {id: $cid}]->(b)
|
| 122 |
+
SET r.severity = $severity,
|
| 123 |
+
r.explanation = $explanation
|
| 124 |
+
""",
|
| 125 |
+
paper_a = c.claim_a.paper_id,
|
| 126 |
+
paper_b = c.claim_b.paper_id,
|
| 127 |
+
cid = c.id,
|
| 128 |
+
severity = c.severity,
|
| 129 |
+
explanation= c.explanation[:300],
|
| 130 |
+
)
|
| 131 |
+
|
| 132 |
+
print(f"[Neo4j] Wrote {len(contradictions)} contradiction edges.")
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
# ββ Query helpers for the API βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 136 |
+
|
| 137 |
+
def get_graph_for_session(session_id: str) -> dict:
|
| 138 |
+
"""Return nodes + edges for a session as a dict compatible with the frontend."""
|
| 139 |
+
driver = get_driver()
|
| 140 |
+
|
| 141 |
+
with driver.session() as session:
|
| 142 |
+
# Nodes
|
| 143 |
+
node_result = session.run(
|
| 144 |
+
"""
|
| 145 |
+
MATCH (p:Paper {sessionId: $sid})
|
| 146 |
+
OPTIONAL MATCH (p)-[:WRITTEN_BY]->(a:Author)
|
| 147 |
+
OPTIONAL MATCH (p)-[:USES]->(m:Method)
|
| 148 |
+
OPTIONAL MATCH (p)-[:TESTED_ON]->(d:Dataset)
|
| 149 |
+
RETURN p, collect(DISTINCT a) AS authors,
|
| 150 |
+
collect(DISTINCT m) AS methods,
|
| 151 |
+
collect(DISTINCT d) AS datasets
|
| 152 |
+
""",
|
| 153 |
+
sid=session_id,
|
| 154 |
+
)
|
| 155 |
+
|
| 156 |
+
nodes = []
|
| 157 |
+
edges = []
|
| 158 |
+
seen_node_ids: set[str] = set()
|
| 159 |
+
|
| 160 |
+
for record in node_result:
|
| 161 |
+
p = record["p"]
|
| 162 |
+
if p["id"] not in seen_node_ids:
|
| 163 |
+
seen_node_ids.add(p["id"])
|
| 164 |
+
nodes.append({"id": p["id"], "label": (p.get("title") or "")[:50], "type": "Paper"})
|
| 165 |
+
|
| 166 |
+
for author in record["authors"]:
|
| 167 |
+
if author is None:
|
| 168 |
+
continue
|
| 169 |
+
aid = f"author_{author['name']}"
|
| 170 |
+
if aid not in seen_node_ids:
|
| 171 |
+
seen_node_ids.add(aid)
|
| 172 |
+
nodes.append({"id": aid, "label": author["name"], "type": "Author"})
|
| 173 |
+
edges.append({"id": f"e_{p['id']}_{aid}", "source": p["id"], "target": aid, "type": "WRITTEN_BY"})
|
| 174 |
+
|
| 175 |
+
for method in record["methods"]:
|
| 176 |
+
if method is None:
|
| 177 |
+
continue
|
| 178 |
+
mid = f"method_{method['name'][:30]}"
|
| 179 |
+
if mid not in seen_node_ids:
|
| 180 |
+
seen_node_ids.add(mid)
|
| 181 |
+
nodes.append({"id": mid, "label": method["name"][:40], "type": "Method"})
|
| 182 |
+
edges.append({"id": f"e_{p['id']}_{mid}", "source": p["id"], "target": mid, "type": "USES"})
|
| 183 |
+
|
| 184 |
+
for dataset in record["datasets"]:
|
| 185 |
+
if dataset is None:
|
| 186 |
+
continue
|
| 187 |
+
did = f"dataset_{dataset['name'][:30]}"
|
| 188 |
+
if did not in seen_node_ids:
|
| 189 |
+
seen_node_ids.add(did)
|
| 190 |
+
nodes.append({"id": did, "label": dataset["name"], "type": "Dataset"})
|
| 191 |
+
edges.append({"id": f"e_{p['id']}_{did}", "source": p["id"], "target": did, "type": "TESTED_ON"})
|
| 192 |
+
|
| 193 |
+
# Contradiction edges
|
| 194 |
+
contra_result = session.run(
|
| 195 |
+
"""
|
| 196 |
+
MATCH (a:Paper {sessionId: $sid})-[r:CONTRADICTS]->(b:Paper {sessionId: $sid})
|
| 197 |
+
RETURN a.id AS src, b.id AS tgt, r.id AS rid, r.severity AS severity
|
| 198 |
+
""",
|
| 199 |
+
sid=session_id,
|
| 200 |
+
)
|
| 201 |
+
for record in contra_result:
|
| 202 |
+
edges.append({
|
| 203 |
+
"id": record["rid"],
|
| 204 |
+
"source": record["src"],
|
| 205 |
+
"target": record["tgt"],
|
| 206 |
+
"type": "CONTRADICTS",
|
| 207 |
+
"label": record["severity"],
|
| 208 |
+
})
|
| 209 |
+
|
| 210 |
+
return {"nodes": nodes, "edges": edges}
|
app/services/parsing/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
ο»Ώ
|
app/services/parsing/pdf_parser.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Downloads and parses PDFs.
|
| 3 |
+
Primary: PyMuPDF (fitz) | Fallback: pdfplumber
|
| 4 |
+
"""
|
| 5 |
+
import fitz # PyMuPDF
|
| 6 |
+
import pdfplumber
|
| 7 |
+
import httpx
|
| 8 |
+
import tempfile
|
| 9 |
+
import os
|
| 10 |
+
from typing import Optional
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
SECTION_KEYWORDS = ["abstract", "introduction", "related work", "background",
|
| 14 |
+
"methodology", "methods", "experiments", "results",
|
| 15 |
+
"discussion", "conclusion", "references", "limitations"]
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def download_pdf(url: str) -> Optional[bytes]:
|
| 19 |
+
try:
|
| 20 |
+
with httpx.Client(timeout=30, follow_redirects=True) as client:
|
| 21 |
+
resp = client.get(url, headers={"User-Agent": "LitAgent/1.0"})
|
| 22 |
+
if resp.status_code == 200 and "pdf" in resp.headers.get("content-type", ""):
|
| 23 |
+
return resp.content
|
| 24 |
+
except Exception:
|
| 25 |
+
pass
|
| 26 |
+
return None
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def extract_text_pymupdf(pdf_bytes: bytes) -> dict[str, str]:
|
| 30 |
+
"""Returns dict of section_name -> text."""
|
| 31 |
+
sections: dict[str, str] = {}
|
| 32 |
+
current_section = "body"
|
| 33 |
+
buffer: list[str] = []
|
| 34 |
+
|
| 35 |
+
with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
|
| 36 |
+
full_text = ""
|
| 37 |
+
for page in doc:
|
| 38 |
+
full_text += page.get_text("text") + "\n"
|
| 39 |
+
|
| 40 |
+
for line in full_text.split("\n"):
|
| 41 |
+
lowered = line.strip().lower()
|
| 42 |
+
if any(lowered.startswith(kw) for kw in SECTION_KEYWORDS):
|
| 43 |
+
if buffer:
|
| 44 |
+
sections[current_section] = "\n".join(buffer).strip()
|
| 45 |
+
buffer = []
|
| 46 |
+
current_section = lowered[:40]
|
| 47 |
+
else:
|
| 48 |
+
buffer.append(line)
|
| 49 |
+
|
| 50 |
+
if buffer:
|
| 51 |
+
sections[current_section] = "\n".join(buffer).strip()
|
| 52 |
+
|
| 53 |
+
return sections
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def extract_text_pdfplumber(pdf_bytes: bytes) -> dict[str, str]:
|
| 57 |
+
"""Fallback parser."""
|
| 58 |
+
text = ""
|
| 59 |
+
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
|
| 60 |
+
f.write(pdf_bytes)
|
| 61 |
+
tmp_path = f.name
|
| 62 |
+
try:
|
| 63 |
+
with pdfplumber.open(tmp_path) as pdf:
|
| 64 |
+
for page in pdf.pages:
|
| 65 |
+
text += (page.extract_text() or "") + "\n"
|
| 66 |
+
finally:
|
| 67 |
+
os.unlink(tmp_path)
|
| 68 |
+
return {"body": text}
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def parse_pdf(url: str) -> Optional[dict[str, str]]:
|
| 72 |
+
"""Main entry point β returns section dict or None if unavailable."""
|
| 73 |
+
pdf_bytes = download_pdf(url)
|
| 74 |
+
if not pdf_bytes:
|
| 75 |
+
return None
|
| 76 |
+
try:
|
| 77 |
+
sections = extract_text_pymupdf(pdf_bytes)
|
| 78 |
+
if len(sections) >= 2:
|
| 79 |
+
return sections
|
| 80 |
+
return extract_text_pdfplumber(pdf_bytes)
|
| 81 |
+
except Exception:
|
| 82 |
+
try:
|
| 83 |
+
return extract_text_pdfplumber(pdf_bytes)
|
| 84 |
+
except Exception:
|
| 85 |
+
return None
|
app/services/reporting/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
ο»Ώ
|
app/services/reporting/report_generator.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Generates the final literature review report using LLM synthesis.
|
| 3 |
+
"""
|
| 4 |
+
from typing import List
|
| 5 |
+
from ...schemas.research import Paper, Report, ResearchGap
|
| 6 |
+
from ...core.llm import chat_completion, chat_json
|
| 7 |
+
from datetime import datetime, timezone
|
| 8 |
+
import uuid
|
| 9 |
+
|
| 10 |
+
SYNTHESIS_PROMPT = """You are an expert academic research synthesizer. Given a list of paper summaries,
|
| 11 |
+
generate a comprehensive literature review report in Markdown format with these sections:
|
| 12 |
+
1. Executive Summary (3-5 sentences)
|
| 13 |
+
2. Trends in Literature
|
| 14 |
+
3. Methodology Comparison
|
| 15 |
+
4. Major Findings
|
| 16 |
+
5. Identified Contradictions
|
| 17 |
+
6. Research Gaps
|
| 18 |
+
7. Suggested Future Directions
|
| 19 |
+
8. References
|
| 20 |
+
|
| 21 |
+
Be analytical, cite papers by [Author, Year], and provide a confidence score (0-1) at the end as:
|
| 22 |
+
CONFIDENCE: 0.XX"""
|
| 23 |
+
|
| 24 |
+
GAP_PROMPT = """Given these paper limitations and contradictions, identify 3-5 distinct research gaps.
|
| 25 |
+
Return a JSON object with key "gaps" containing an array of objects:
|
| 26 |
+
{title, description, category, recurrence_count, importance_score, derivation_note}
|
| 27 |
+
Categories: missing_population, missing_replication, inconsistent_benchmarks, data_scarcity, interpretability, fairness, other"""
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
async def generate_report(query: str, papers: List[Paper], session_id: str) -> Report:
|
| 31 |
+
paper_texts = []
|
| 32 |
+
for p in papers[:20]:
|
| 33 |
+
summary = f"[{', '.join(p.authors[:2])}, {p.year}] {p.title}"
|
| 34 |
+
if p.extraction:
|
| 35 |
+
summary += f"\n Objective: {p.extraction.objective}"
|
| 36 |
+
summary += f"\n Method: {p.extraction.methodology[:100]}"
|
| 37 |
+
if p.extraction.key_findings:
|
| 38 |
+
summary += f"\n Findings: {'; '.join(p.extraction.key_findings[:2])}"
|
| 39 |
+
if p.extraction.limitations:
|
| 40 |
+
summary += f"\n Limitations: {'; '.join(p.extraction.limitations[:2])}"
|
| 41 |
+
paper_texts.append(summary)
|
| 42 |
+
|
| 43 |
+
context = f"Research Query: {query}\n\nPapers:\n" + "\n\n".join(paper_texts)
|
| 44 |
+
|
| 45 |
+
report_markdown = ""
|
| 46 |
+
confidence_score = 0.7
|
| 47 |
+
gaps: List[ResearchGap] = []
|
| 48 |
+
|
| 49 |
+
try:
|
| 50 |
+
report_markdown = await chat_completion(SYNTHESIS_PROMPT, context, temperature=0.3)
|
| 51 |
+
|
| 52 |
+
if "CONFIDENCE:" in report_markdown:
|
| 53 |
+
try:
|
| 54 |
+
line = [l for l in report_markdown.split("\n") if "CONFIDENCE:" in l][0]
|
| 55 |
+
confidence_score = float(line.split(":")[1].strip())
|
| 56 |
+
except Exception:
|
| 57 |
+
pass
|
| 58 |
+
|
| 59 |
+
lim_context = "\n".join(
|
| 60 |
+
f"- [{p.title[:40]}]: {'; '.join((p.extraction.limitations or [])[:2])}"
|
| 61 |
+
for p in papers if p.extraction and p.extraction.limitations
|
| 62 |
+
)[:2000]
|
| 63 |
+
|
| 64 |
+
if lim_context:
|
| 65 |
+
gap_data = await chat_json(GAP_PROMPT, lim_context)
|
| 66 |
+
raw_gaps = gap_data.get("gaps", gap_data) if isinstance(gap_data, dict) else gap_data
|
| 67 |
+
if isinstance(raw_gaps, list):
|
| 68 |
+
gaps = [
|
| 69 |
+
ResearchGap(id=str(uuid.uuid4()), session_id=session_id, **g)
|
| 70 |
+
for g in raw_gaps
|
| 71 |
+
]
|
| 72 |
+
|
| 73 |
+
except Exception as e:
|
| 74 |
+
report_markdown = (
|
| 75 |
+
f"# Literature Review: {query}\n\n"
|
| 76 |
+
f"*Report generation encountered an error: {e}*\n\n"
|
| 77 |
+
+ "\n\n".join(f"- **{p.title}** ({p.year})" for p in papers[:10])
|
| 78 |
+
)
|
| 79 |
+
|
| 80 |
+
return Report(
|
| 81 |
+
id=str(uuid.uuid4()),
|
| 82 |
+
session_id=session_id,
|
| 83 |
+
report_markdown=report_markdown,
|
| 84 |
+
research_gaps=gaps,
|
| 85 |
+
confidence_score=confidence_score,
|
| 86 |
+
created_at=datetime.now(timezone.utc).isoformat(),
|
| 87 |
+
)
|
app/services/retrieval/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
ο»Ώ
|
app/services/retrieval/arxiv_adapter.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""arXiv paper retrieval adapter."""
|
| 2 |
+
import arxiv
|
| 3 |
+
from typing import List
|
| 4 |
+
from ...schemas.research import Paper, PaperSource
|
| 5 |
+
import uuid
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def search(query: str, session_id: str, max_results: int = 15) -> List[Paper]:
|
| 9 |
+
client = arxiv.Client()
|
| 10 |
+
search_obj = arxiv.Search(
|
| 11 |
+
query=query,
|
| 12 |
+
max_results=max_results,
|
| 13 |
+
sort_by=arxiv.SortCriterion.Relevance,
|
| 14 |
+
)
|
| 15 |
+
papers = []
|
| 16 |
+
for r in client.results(search_obj):
|
| 17 |
+
papers.append(Paper(
|
| 18 |
+
id=str(uuid.uuid4()),
|
| 19 |
+
session_id=session_id,
|
| 20 |
+
external_source=PaperSource.arxiv,
|
| 21 |
+
source_paper_id=r.entry_id,
|
| 22 |
+
title=r.title,
|
| 23 |
+
authors=[a.name for a in r.authors],
|
| 24 |
+
year=r.published.year if r.published else None,
|
| 25 |
+
abstract=r.summary,
|
| 26 |
+
doi=r.doi,
|
| 27 |
+
venue="arXiv",
|
| 28 |
+
pdf_url=r.pdf_url,
|
| 29 |
+
))
|
| 30 |
+
return papers
|
app/services/retrieval/query_decomposer.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Decomposes a raw research query into a structured search plan using LLM.
|
| 3 |
+
"""
|
| 4 |
+
from pydantic import BaseModel
|
| 5 |
+
from typing import List
|
| 6 |
+
from ...core.llm import chat_json
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class QueryPlan(BaseModel):
|
| 10 |
+
main_topic: str
|
| 11 |
+
subtopics: List[str]
|
| 12 |
+
filters: dict
|
| 13 |
+
query_variants: List[str]
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
SYSTEM_PROMPT = """You are a research librarian. Given a user's research question,
|
| 17 |
+
decompose it into a structured search plan. Return valid JSON only with these keys:
|
| 18 |
+
- main_topic: string
|
| 19 |
+
- subtopics: list of strings
|
| 20 |
+
- filters: {"year_from": int or null}
|
| 21 |
+
- query_variants: list of 3-5 search strings for different academic databases"""
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
async def decompose_query(query: str) -> QueryPlan:
|
| 25 |
+
try:
|
| 26 |
+
data = await chat_json(SYSTEM_PROMPT, query)
|
| 27 |
+
return QueryPlan(**data)
|
| 28 |
+
except Exception:
|
| 29 |
+
return QueryPlan(
|
| 30 |
+
main_topic=query[:80],
|
| 31 |
+
subtopics=[],
|
| 32 |
+
filters={},
|
| 33 |
+
query_variants=[query],
|
| 34 |
+
)
|
app/services/retrieval/ranker.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Scores and ranks papers by relevance to the original query.
|
| 3 |
+
Uses: semantic similarity (sentence-transformers) + citation count + recency.
|
| 4 |
+
"""
|
| 5 |
+
from typing import List
|
| 6 |
+
from ...schemas.research import Paper
|
| 7 |
+
from sentence_transformers import SentenceTransformer, util
|
| 8 |
+
import torch
|
| 9 |
+
|
| 10 |
+
_model: SentenceTransformer | None = None
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _get_model() -> SentenceTransformer:
|
| 14 |
+
global _model
|
| 15 |
+
if _model is None:
|
| 16 |
+
_model = SentenceTransformer("all-MiniLM-L6-v2")
|
| 17 |
+
return _model
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def rank_papers(query: str, papers: List[Paper], top_k: int = 20) -> List[Paper]:
|
| 21 |
+
if not papers:
|
| 22 |
+
return []
|
| 23 |
+
|
| 24 |
+
model = _get_model()
|
| 25 |
+
texts = [f"{p.title}. {p.abstract[:300]}" for p in papers]
|
| 26 |
+
q_emb = model.encode(query, convert_to_tensor=True)
|
| 27 |
+
p_embs = model.encode(texts, convert_to_tensor=True)
|
| 28 |
+
|
| 29 |
+
sem_scores = util.cos_sim(q_emb, p_embs)[0].tolist()
|
| 30 |
+
|
| 31 |
+
# Normalise citation counts
|
| 32 |
+
max_citations = max((p.citation_count or 0 for p in papers), default=1) or 1
|
| 33 |
+
current_year = 2025
|
| 34 |
+
|
| 35 |
+
scored = []
|
| 36 |
+
for i, paper in enumerate(papers):
|
| 37 |
+
sem = sem_scores[i]
|
| 38 |
+
cite = (paper.citation_count or 0) / max_citations
|
| 39 |
+
rec = max(0, 1 - (current_year - (paper.year or current_year)) / 10)
|
| 40 |
+
score = 0.6 * sem + 0.25 * cite + 0.15 * rec
|
| 41 |
+
paper.relevance_score = round(float(score), 4)
|
| 42 |
+
scored.append(paper)
|
| 43 |
+
|
| 44 |
+
scored.sort(key=lambda p: p.relevance_score or 0, reverse=True)
|
| 45 |
+
return scored[:top_k]
|
app/services/retrieval/semantic_scholar_adapter.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Semantic Scholar paper retrieval adapter."""
|
| 2 |
+
import httpx
|
| 3 |
+
from typing import List
|
| 4 |
+
from ...schemas.research import Paper, PaperSource
|
| 5 |
+
from ...core.config import get_settings
|
| 6 |
+
import uuid
|
| 7 |
+
import time
|
| 8 |
+
|
| 9 |
+
BASE = "https://api.semanticscholar.org/graph/v1"
|
| 10 |
+
FIELDS = "paperId,title,authors,year,abstract,citationCount,venue,externalIds,openAccessPdf"
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _headers() -> dict:
|
| 14 |
+
key = get_settings().semantic_scholar_api_key
|
| 15 |
+
return {"x-api-key": key} if key else {}
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def search(query: str, session_id: str, limit: int = 10) -> List[Paper]:
|
| 19 |
+
try:
|
| 20 |
+
with httpx.Client(timeout=15) as client:
|
| 21 |
+
resp = client.get(
|
| 22 |
+
f"{BASE}/paper/search",
|
| 23 |
+
params={"query": query, "limit": limit, "fields": FIELDS},
|
| 24 |
+
headers=_headers(),
|
| 25 |
+
)
|
| 26 |
+
if resp.status_code == 429:
|
| 27 |
+
print("[SemanticScholar] Rate limited β skipping, using arXiv only.")
|
| 28 |
+
return []
|
| 29 |
+
if resp.status_code != 200:
|
| 30 |
+
print(f"[SemanticScholar] search returned {resp.status_code} β skipping.")
|
| 31 |
+
return []
|
| 32 |
+
data = resp.json().get("data", []) or []
|
| 33 |
+
except Exception as e:
|
| 34 |
+
print(f"[SemanticScholar] search error: {e}")
|
| 35 |
+
return []
|
| 36 |
+
|
| 37 |
+
papers = []
|
| 38 |
+
for item in data:
|
| 39 |
+
try:
|
| 40 |
+
papers.append(Paper(
|
| 41 |
+
id=str(uuid.uuid4()),
|
| 42 |
+
session_id=session_id,
|
| 43 |
+
external_source=PaperSource.semantic_scholar,
|
| 44 |
+
source_paper_id=item.get("paperId", ""),
|
| 45 |
+
title=item.get("title") or "",
|
| 46 |
+
authors=[a.get("name", "") for a in (item.get("authors") or [])],
|
| 47 |
+
year=item.get("year"),
|
| 48 |
+
abstract=item.get("abstract") or "",
|
| 49 |
+
doi=item.get("externalIds", {}).get("DOI"),
|
| 50 |
+
venue=item.get("venue"),
|
| 51 |
+
citation_count=item.get("citationCount"),
|
| 52 |
+
pdf_url=(item.get("openAccessPdf") or {}).get("url"),
|
| 53 |
+
))
|
| 54 |
+
except Exception:
|
| 55 |
+
continue
|
| 56 |
+
return papers
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def get_references(paper_id: str, session_id: str, limit: int = 10) -> List[Paper]:
|
| 60 |
+
if not paper_id:
|
| 61 |
+
return []
|
| 62 |
+
try:
|
| 63 |
+
time.sleep(1) # Be polite to avoid 429
|
| 64 |
+
with httpx.Client(timeout=15) as client:
|
| 65 |
+
resp = client.get(
|
| 66 |
+
f"{BASE}/paper/{paper_id}/references",
|
| 67 |
+
params={"limit": limit, "fields": FIELDS},
|
| 68 |
+
headers=_headers(),
|
| 69 |
+
)
|
| 70 |
+
if resp.status_code != 200:
|
| 71 |
+
return []
|
| 72 |
+
data = resp.json().get("data", []) or []
|
| 73 |
+
except Exception as e:
|
| 74 |
+
print(f"[SemanticScholar] get_references error: {e}")
|
| 75 |
+
return []
|
| 76 |
+
|
| 77 |
+
results = []
|
| 78 |
+
for item in (data or []):
|
| 79 |
+
cited = item.get("citedPaper") or {}
|
| 80 |
+
if not cited.get("title"):
|
| 81 |
+
continue
|
| 82 |
+
try:
|
| 83 |
+
results.append(Paper(
|
| 84 |
+
id=str(uuid.uuid4()),
|
| 85 |
+
session_id=session_id,
|
| 86 |
+
external_source=PaperSource.semantic_scholar,
|
| 87 |
+
source_paper_id=cited.get("paperId", ""),
|
| 88 |
+
title=cited.get("title", ""),
|
| 89 |
+
authors=[a.get("name", "") for a in (cited.get("authors") or [])],
|
| 90 |
+
year=cited.get("year"),
|
| 91 |
+
abstract=cited.get("abstract") or "",
|
| 92 |
+
))
|
| 93 |
+
except Exception:
|
| 94 |
+
continue
|
| 95 |
+
return results
|
app/workers/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
ο»Ώ
|
app/workers/pipeline.py
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Research pipeline using FastAPI BackgroundTasks (no Redis/Celery required).
|
| 3 |
+
"""
|
| 4 |
+
import asyncio
|
| 5 |
+
import uuid
|
| 6 |
+
from datetime import datetime, timezone
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def _update_status(db, session_id: str, status: str, extra: dict = {}):
|
| 10 |
+
try:
|
| 11 |
+
db.collection("research_sessions").document(session_id).update(
|
| 12 |
+
{"status": status, **extra}
|
| 13 |
+
)
|
| 14 |
+
print(f"[Pipeline] {session_id[:8]}... β {status}")
|
| 15 |
+
except Exception as e:
|
| 16 |
+
print(f"[Pipeline] Failed to update status: {e}")
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
async def run_research_pipeline(session_id: str, query: str, user_id: str):
|
| 20 |
+
"""Full async pipeline β called as a FastAPI background task."""
|
| 21 |
+
from ..core.firebase import get_db, init_firebase
|
| 22 |
+
from ..services.retrieval.query_decomposer import decompose_query
|
| 23 |
+
from ..services.retrieval import arxiv_adapter, semantic_scholar_adapter
|
| 24 |
+
from ..services.retrieval.ranker import rank_papers
|
| 25 |
+
from ..services.parsing.pdf_parser import parse_pdf
|
| 26 |
+
from ..services.extraction.extractor import extract_paper
|
| 27 |
+
from ..services.extraction.contradiction_detector import detect_contradictions
|
| 28 |
+
from ..services.graph.graph_builder import build_graph
|
| 29 |
+
from ..services.reporting.report_generator import generate_report
|
| 30 |
+
from ..services.council.council_runner import council_validate_report
|
| 31 |
+
from ..schemas.research import Claim, ClaimType
|
| 32 |
+
from ..core.config import get_settings
|
| 33 |
+
|
| 34 |
+
init_firebase()
|
| 35 |
+
db = get_db()
|
| 36 |
+
settings = get_settings()
|
| 37 |
+
|
| 38 |
+
try:
|
| 39 |
+
# ββ Stage 1: Retrieve papers ββββββββββββββββββββββββββββββββββββββββββ
|
| 40 |
+
_update_status(db, session_id, "retrieving")
|
| 41 |
+
query_plan = await decompose_query(query)
|
| 42 |
+
all_papers = []
|
| 43 |
+
|
| 44 |
+
for variant in (query_plan.query_variants or [query])[:2]:
|
| 45 |
+
# arXiv β always reliable
|
| 46 |
+
try:
|
| 47 |
+
all_papers += arxiv_adapter.search(variant, session_id, max_results=10)
|
| 48 |
+
print(f"[arXiv] fetched {len(all_papers)} papers for: {variant[:50]}")
|
| 49 |
+
except Exception as e:
|
| 50 |
+
print(f"[arXiv] error: {e}")
|
| 51 |
+
|
| 52 |
+
# Semantic Scholar β skip gracefully if rate-limited
|
| 53 |
+
try:
|
| 54 |
+
ss = semantic_scholar_adapter.search(variant, session_id, limit=8)
|
| 55 |
+
all_papers += ss
|
| 56 |
+
print(f"[S2] fetched {len(ss)} papers")
|
| 57 |
+
except Exception as e:
|
| 58 |
+
print(f"[S2] error: {e}")
|
| 59 |
+
|
| 60 |
+
# Deduplicate by title
|
| 61 |
+
seen: set[str] = set()
|
| 62 |
+
unique_papers = []
|
| 63 |
+
for p in all_papers:
|
| 64 |
+
key = (p.title or "").lower()[:60]
|
| 65 |
+
if key and key not in seen:
|
| 66 |
+
seen.add(key)
|
| 67 |
+
unique_papers.append(p)
|
| 68 |
+
|
| 69 |
+
print(f"[Pipeline] {len(unique_papers)} unique papers after dedup")
|
| 70 |
+
|
| 71 |
+
if not unique_papers:
|
| 72 |
+
_update_status(db, session_id, "failed", {"error": "No papers found. Try a different query."})
|
| 73 |
+
return
|
| 74 |
+
|
| 75 |
+
# ββ Stage 2: Rank βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 76 |
+
_update_status(db, session_id, "ranking")
|
| 77 |
+
try:
|
| 78 |
+
ranked = rank_papers(query, unique_papers, top_k=20)
|
| 79 |
+
except Exception as e:
|
| 80 |
+
print(f"[Ranker] error: {e} β using unranked")
|
| 81 |
+
ranked = unique_papers[:20]
|
| 82 |
+
|
| 83 |
+
# ββ Stage 3: Citation expansion βββββββββββββββββββββββββββββββββββββββ
|
| 84 |
+
_update_status(db, session_id, "expanding_citations")
|
| 85 |
+
for paper in ranked[:3]:
|
| 86 |
+
if paper.external_source == "semantic_scholar" and paper.source_paper_id:
|
| 87 |
+
try:
|
| 88 |
+
extras = semantic_scholar_adapter.get_references(
|
| 89 |
+
paper.source_paper_id, session_id, limit=5
|
| 90 |
+
) or []
|
| 91 |
+
for ep in extras:
|
| 92 |
+
key = (ep.title or "").lower()[:60]
|
| 93 |
+
if key and key not in seen:
|
| 94 |
+
seen.add(key)
|
| 95 |
+
ranked.append(ep)
|
| 96 |
+
except Exception as e:
|
| 97 |
+
print(f"[Citations] error: {e}")
|
| 98 |
+
|
| 99 |
+
final_papers = ranked[:settings.max_papers_per_session]
|
| 100 |
+
_update_status(db, session_id, "extracting", {"paperCount": len(final_papers)})
|
| 101 |
+
|
| 102 |
+
# Save basic paper metadata immediately so UI can show them
|
| 103 |
+
for paper in final_papers:
|
| 104 |
+
try:
|
| 105 |
+
db.collection("papers").document(paper.id).set({
|
| 106 |
+
**paper.model_dump(),
|
| 107 |
+
"sessionId": session_id,
|
| 108 |
+
})
|
| 109 |
+
except Exception as e:
|
| 110 |
+
print(f"[Firestore] paper save error: {e}")
|
| 111 |
+
|
| 112 |
+
# ββ Stage 4: PDF Parsing ββββββββββββββββββββββββββββοΏ½οΏ½οΏ½βββββββββββββββββ
|
| 113 |
+
_update_status(db, session_id, "parsing_pdfs")
|
| 114 |
+
paper_sections: dict[str, dict] = {}
|
| 115 |
+
for paper in final_papers:
|
| 116 |
+
if paper.pdf_url:
|
| 117 |
+
try:
|
| 118 |
+
sections = parse_pdf(paper.pdf_url)
|
| 119 |
+
paper_sections[paper.id] = sections or {}
|
| 120 |
+
except Exception as e:
|
| 121 |
+
print(f"[PDF] parse error for {paper.title[:40]}: {e}")
|
| 122 |
+
paper_sections[paper.id] = {}
|
| 123 |
+
|
| 124 |
+
# ββ Stage 5: LLM Extraction βββββββββββββββββββββββββββββββββββββββββββ
|
| 125 |
+
_update_status(db, session_id, "extracting")
|
| 126 |
+
for paper in final_papers:
|
| 127 |
+
try:
|
| 128 |
+
sections = paper_sections.get(paper.id, {})
|
| 129 |
+
paper.extraction = await extract_paper(paper.title, paper.abstract, sections)
|
| 130 |
+
db.collection("papers").document(paper.id).update(
|
| 131 |
+
{"extraction": paper.extraction.model_dump()}
|
| 132 |
+
)
|
| 133 |
+
except Exception as e:
|
| 134 |
+
print(f"[Extract] error for {paper.title[:40]}: {e}")
|
| 135 |
+
|
| 136 |
+
# ββ Stage 6: Graph + Contradictions βββββββββββββββββββββββββββββββββββ
|
| 137 |
+
_update_status(db, session_id, "building_graph")
|
| 138 |
+
claims: list[Claim] = []
|
| 139 |
+
for paper in final_papers:
|
| 140 |
+
if paper.extraction and paper.extraction.key_findings:
|
| 141 |
+
for finding in paper.extraction.key_findings[:2]:
|
| 142 |
+
claims.append(Claim(
|
| 143 |
+
id=str(uuid.uuid4()),
|
| 144 |
+
paper_id=paper.id,
|
| 145 |
+
paper_title=paper.title,
|
| 146 |
+
claim_text=finding,
|
| 147 |
+
claim_type=ClaimType.empirical,
|
| 148 |
+
evidence_span=finding,
|
| 149 |
+
confidence=0.75,
|
| 150 |
+
))
|
| 151 |
+
|
| 152 |
+
try:
|
| 153 |
+
contradictions = await detect_contradictions(claims)
|
| 154 |
+
for c in contradictions:
|
| 155 |
+
c.session_id = session_id
|
| 156 |
+
db.collection("contradictions").document(c.id).set(c.model_dump())
|
| 157 |
+
except Exception as e:
|
| 158 |
+
print(f"[Contradictions] error: {e}")
|
| 159 |
+
contradictions = []
|
| 160 |
+
|
| 161 |
+
try:
|
| 162 |
+
graph = build_graph(final_papers)
|
| 163 |
+
db.collection("graphs").document(session_id).set(graph.model_dump())
|
| 164 |
+
|
| 165 |
+
# ββ Persist to Neo4j AuraDB βββββββββββββββββββββββββββββββββββββββ
|
| 166 |
+
try:
|
| 167 |
+
from ..core.neo4j_client import get_driver
|
| 168 |
+
from ..services.graph.neo4j_writer import (
|
| 169 |
+
ensure_constraints, write_papers, write_contradictions,
|
| 170 |
+
)
|
| 171 |
+
ensure_constraints()
|
| 172 |
+
write_papers(final_papers, session_id)
|
| 173 |
+
write_contradictions(contradictions)
|
| 174 |
+
print(f"[Neo4j] Knowledge graph persisted for session {session_id}.")
|
| 175 |
+
except Exception as neo4j_err:
|
| 176 |
+
print(f"[Neo4j] Non-fatal β could not write to AuraDB: {neo4j_err}")
|
| 177 |
+
except Exception as e:
|
| 178 |
+
print(f"[Graph] error: {e}")
|
| 179 |
+
|
| 180 |
+
# ββ Stage 7: Report βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 181 |
+
_update_status(db, session_id, "generating_report")
|
| 182 |
+
try:
|
| 183 |
+
report = await generate_report(query, final_papers, session_id)
|
| 184 |
+
except Exception as e:
|
| 185 |
+
print(f"[Report] error: {e}")
|
| 186 |
+
from ..schemas.research import Report
|
| 187 |
+
report = Report(
|
| 188 |
+
id=str(uuid.uuid4()),
|
| 189 |
+
session_id=session_id,
|
| 190 |
+
report_markdown=f"# Literature Review: {query}\n\nReport generation failed: {e}",
|
| 191 |
+
confidence_score=0.0,
|
| 192 |
+
created_at=datetime.now(timezone.utc).isoformat(),
|
| 193 |
+
)
|
| 194 |
+
|
| 195 |
+
# ββ Stage 8: LLM Council ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 196 |
+
_update_status(db, session_id, "council_review")
|
| 197 |
+
try:
|
| 198 |
+
papers_summary = "\n".join(
|
| 199 |
+
f"- {p.title}: {p.extraction.summary if p.extraction else ''}"
|
| 200 |
+
for p in final_papers[:8]
|
| 201 |
+
)
|
| 202 |
+
council_notes = await council_validate_report(report.report_markdown, papers_summary)
|
| 203 |
+
report.report_markdown += (
|
| 204 |
+
"\n\n---\n## LLM Council Notes\n"
|
| 205 |
+
f"**Extractor:** {council_notes['extractor_notes'][:300]}\n\n"
|
| 206 |
+
f"**Skeptic:** {council_notes['skeptic_notes'][:300]}\n\n"
|
| 207 |
+
f"**Synthesizer:** {council_notes['synthesizer_notes'][:300]}"
|
| 208 |
+
)
|
| 209 |
+
except Exception as e:
|
| 210 |
+
print(f"[Council] error: {e}")
|
| 211 |
+
|
| 212 |
+
db.collection("reports").document(report.id).set(report.model_dump())
|
| 213 |
+
|
| 214 |
+
# ββ Done ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 215 |
+
_update_status(db, session_id, "completed", {
|
| 216 |
+
"completedAt": datetime.now(timezone.utc).isoformat(),
|
| 217 |
+
"paperCount": len(final_papers),
|
| 218 |
+
})
|
| 219 |
+
print(f"[Pipeline] COMPLETE β {len(final_papers)} papers, session {session_id[:8]}")
|
| 220 |
+
|
| 221 |
+
except Exception as exc:
|
| 222 |
+
print(f"[Pipeline] FATAL: {exc}")
|
| 223 |
+
try:
|
| 224 |
+
_update_status(db, session_id, "failed", {"error": str(exc)})
|
| 225 |
+
except Exception:
|
| 226 |
+
pass
|
| 227 |
+
raise
|
requirements.txt
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Web framework
|
| 2 |
+
fastapi>=0.115.0
|
| 3 |
+
uvicorn[standard]>=0.32.0
|
| 4 |
+
python-multipart>=0.0.12
|
| 5 |
+
|
| 6 |
+
# Firebase Admin (replaces PostgreSQL)
|
| 7 |
+
firebase-admin>=6.6.0
|
| 8 |
+
|
| 9 |
+
# Background tasks (FastAPI BackgroundTasks β no Redis required)
|
| 10 |
+
|
| 11 |
+
# LLM adapters
|
| 12 |
+
groq>=0.13.0
|
| 13 |
+
openai>=1.58.0
|
| 14 |
+
anthropic>=0.40.0
|
| 15 |
+
google-generativeai>=0.8.3
|
| 16 |
+
|
| 17 |
+
# PDF parsing
|
| 18 |
+
PyMuPDF>=1.24.14
|
| 19 |
+
pdfplumber>=0.11.4
|
| 20 |
+
|
| 21 |
+
# NLP / Embeddings
|
| 22 |
+
sentence-transformers>=3.3.1
|
| 23 |
+
scikit-learn>=1.5.2
|
| 24 |
+
spacy>=3.8.3
|
| 25 |
+
|
| 26 |
+
# Graph
|
| 27 |
+
networkx>=3.4.2
|
| 28 |
+
|
| 29 |
+
# Academic APIs
|
| 30 |
+
arxiv>=2.1.3
|
| 31 |
+
requests>=2.32.3
|
| 32 |
+
httpx>=0.28.1
|
| 33 |
+
|
| 34 |
+
# Validation
|
| 35 |
+
pydantic>=2.10.3
|
| 36 |
+
pydantic-settings>=2.6.1
|
| 37 |
+
|
| 38 |
+
# Utilities
|
| 39 |
+
python-dotenv>=1.0.1
|
| 40 |
+
tenacity>=9.0.0
|