Vansh180 commited on
Commit
7dd9eed
Β·
0 Parent(s):

feat: full-stack LitAgent MVP with Neo4j knowledge graph integration

Browse files

- FastAPI backend: pipeline (arXiv + Semantic Scholar), extraction, contradiction detection, report generation, LLM Council
- Firebase Auth + Firestore for sessions, papers, reports
- Neo4j AuraDB: writes Paper/Author/Method/Dataset nodes and CITES/USES/CONTRADICTS edges per session
- Next.js frontend: dashboard, session page with stepper, tabs (papers, report, graph, contradictions, gaps)
- ReactFlow knowledge graph viewer with stable nodeTypes and memoized nodes/edges
- Graceful 404 handling on report/graph tabs with auto-fetch on session completion
- Groq (llama-3.3-70b-versatile) as default LLM provider

Made-with: Cursor

app/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ ο»Ώ
app/api/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ ο»Ώ
app/api/research.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks
2
+ from ..schemas.research import (
3
+ ResearchQueryRequest, ResearchQueryResponse, SessionStatusResponse,
4
+ )
5
+ from ..core.auth import get_current_user
6
+ from ..core.firebase import get_db
7
+ from ..workers.pipeline import run_research_pipeline
8
+ import uuid
9
+ from datetime import datetime, timezone
10
+
11
+ router = APIRouter(prefix="/research", tags=["research"])
12
+
13
+
14
+ @router.post("/query", response_model=ResearchQueryResponse)
15
+ async def submit_query(
16
+ body: ResearchQueryRequest,
17
+ background_tasks: BackgroundTasks,
18
+ user: dict = Depends(get_current_user),
19
+ ):
20
+ db = get_db()
21
+ session_id = str(uuid.uuid4())
22
+ now = datetime.now(timezone.utc).isoformat()
23
+
24
+ db.collection("research_sessions").document(session_id).set({
25
+ "userId": user["uid"],
26
+ "query": body.query,
27
+ "status": "accepted",
28
+ "createdAt": now,
29
+ "paperCount": 0,
30
+ })
31
+
32
+ background_tasks.add_task(run_research_pipeline, session_id, body.query, user["uid"])
33
+
34
+ return ResearchQueryResponse(session_id=session_id)
35
+
36
+
37
+ @router.get("/{session_id}/status", response_model=SessionStatusResponse)
38
+ async def get_status(
39
+ session_id: str,
40
+ user: dict = Depends(get_current_user),
41
+ ):
42
+ db = get_db()
43
+ doc = db.collection("research_sessions").document(session_id).get()
44
+ if not doc.exists:
45
+ raise HTTPException(status_code=404, detail="Session not found")
46
+ data = doc.to_dict()
47
+ if data.get("userId") != user["uid"]:
48
+ raise HTTPException(status_code=403, detail="Forbidden")
49
+ return SessionStatusResponse(
50
+ session_id=session_id,
51
+ status=data["status"],
52
+ paper_count=data.get("paperCount"),
53
+ )
54
+
55
+
56
+ @router.get("/{session_id}/papers")
57
+ async def get_papers(session_id: str, user: dict = Depends(get_current_user)):
58
+ db = get_db()
59
+ docs = db.collection("papers").where("sessionId", "==", session_id).stream()
60
+ papers = [{"id": d.id, **d.to_dict()} for d in docs]
61
+ return {"papers": papers}
62
+
63
+
64
+ @router.get("/{session_id}/report")
65
+ async def get_report(session_id: str, user: dict = Depends(get_current_user)):
66
+ db = get_db()
67
+ docs = db.collection("reports").where("sessionId", "==", session_id).limit(1).stream()
68
+ report = next(({"id": d.id, **d.to_dict()} for d in docs), None)
69
+ if not report:
70
+ raise HTTPException(status_code=404, detail="Report not ready yet")
71
+ return {"report": report}
72
+
73
+
74
+ @router.get("/{session_id}/graph")
75
+ async def get_graph(session_id: str, user: dict = Depends(get_current_user)):
76
+ db = get_db()
77
+ doc = db.collection("graphs").document(session_id).get()
78
+ if not doc.exists:
79
+ raise HTTPException(status_code=404, detail="Graph not ready yet")
80
+ return {"graph": doc.to_dict()}
81
+
82
+
83
+ @router.get("/{session_id}/neo4j-graph")
84
+ async def get_neo4j_graph(session_id: str, user: dict = Depends(get_current_user)):
85
+ """Returns the knowledge graph directly from Neo4j AuraDB."""
86
+ try:
87
+ from ..services.graph.neo4j_writer import get_graph_for_session
88
+ data = get_graph_for_session(session_id)
89
+ return {"graph": data}
90
+ except RuntimeError as e:
91
+ raise HTTPException(status_code=503, detail=str(e))
92
+ except Exception as e:
93
+ raise HTTPException(status_code=500, detail=f"Neo4j query failed: {e}")
94
+
95
+
96
+ @router.get("/{session_id}/contradictions")
97
+ async def get_contradictions(session_id: str, user: dict = Depends(get_current_user)):
98
+ db = get_db()
99
+ docs = db.collection("contradictions").where("sessionId", "==", session_id).stream()
100
+ return {"contradictions": [{"id": d.id, **d.to_dict()} for d in docs]}
app/core/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ ο»Ώ
app/core/auth.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import json
3
+ from fastapi import Depends, HTTPException, status
4
+ from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
5
+
6
+ bearer_scheme = HTTPBearer(auto_error=False)
7
+
8
+
9
+ def _decode_jwt_payload(token: str) -> dict:
10
+ """
11
+ Decode Firebase JWT payload without network verification.
12
+ Firebase tokens are standard JWTs β€” we extract uid from the payload.
13
+ """
14
+ try:
15
+ payload_part = token.split('.')[1]
16
+ # Fix base64 padding
17
+ payload_part += '=' * (4 - len(payload_part) % 4)
18
+ decoded = base64.urlsafe_b64decode(payload_part)
19
+ return json.loads(decoded)
20
+ except Exception as e:
21
+ raise HTTPException(
22
+ status_code=status.HTTP_401_UNAUTHORIZED,
23
+ detail=f"Could not decode token: {e}"
24
+ )
25
+
26
+
27
+ def get_current_user(
28
+ credentials: HTTPAuthorizationCredentials = Depends(bearer_scheme),
29
+ ) -> dict:
30
+ if not credentials or not credentials.credentials:
31
+ raise HTTPException(
32
+ status_code=status.HTTP_401_UNAUTHORIZED,
33
+ detail="Not authenticated β€” please log in"
34
+ )
35
+
36
+ payload = _decode_jwt_payload(credentials.credentials)
37
+
38
+ uid = payload.get("user_id") or payload.get("sub")
39
+ if not uid:
40
+ raise HTTPException(
41
+ status_code=status.HTTP_401_UNAUTHORIZED,
42
+ detail="Token missing user ID"
43
+ )
44
+
45
+ return {"uid": uid, "email": payload.get("email", "")}
app/core/config.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic_settings import BaseSettings
2
+ from functools import lru_cache
3
+
4
+
5
+ class Settings(BaseSettings):
6
+ # Neo4j AuraDB
7
+ neo4j_uri: str = ""
8
+ neo4j_username: str = ""
9
+ neo4j_password: str = ""
10
+
11
+ # Firebase
12
+ firebase_credentials_path: str = "serviceAccountKey.json"
13
+ firebase_project_id: str = ""
14
+
15
+ # LLM
16
+ groq_api_key: str = ""
17
+ openai_api_key: str = ""
18
+ anthropic_api_key: str = ""
19
+ gemini_api_key: str = ""
20
+ default_llm: str = "groq" # groq | openai | anthropic | gemini
21
+
22
+ # Academic APIs
23
+ semantic_scholar_api_key: str = ""
24
+ pubmed_email: str = ""
25
+
26
+ # Celery / Redis
27
+ redis_url: str = "redis://localhost:6379/0"
28
+ celery_broker_url: str = "redis://localhost:6379/0"
29
+ celery_result_backend: str = "redis://localhost:6379/1"
30
+
31
+ # Pipeline limits
32
+ max_papers_per_session: int = 50
33
+ max_citation_depth: int = 1
34
+ max_expansion_per_paper: int = 10
35
+
36
+ class Config:
37
+ env_file = ".env"
38
+
39
+
40
+ @lru_cache
41
+ def get_settings() -> Settings:
42
+ return Settings()
app/core/firebase.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ # Fix gRPC DNS resolution on Windows before any grpc import
4
+ os.environ.setdefault("GRPC_DNS_RESOLVER", "native")
5
+
6
+ import firebase_admin
7
+ from firebase_admin import credentials, firestore, auth
8
+ from .config import get_settings
9
+
10
+ _initialized = False
11
+ _db = None
12
+
13
+
14
+ def init_firebase():
15
+ global _initialized
16
+ if _initialized or firebase_admin._apps:
17
+ _initialized = True
18
+ return
19
+
20
+ settings = get_settings()
21
+ cred_path = settings.firebase_credentials_path
22
+
23
+ if not os.path.exists(cred_path):
24
+ print(
25
+ f"[Firebase] WARNING: '{cred_path}' not found. "
26
+ "Download from Firebase Console > Project Settings > Service Accounts."
27
+ )
28
+ return
29
+
30
+ cred = credentials.Certificate(cred_path)
31
+ firebase_admin.initialize_app(cred)
32
+ _initialized = True
33
+ print("[Firebase] Initialised successfully.")
34
+
35
+
36
+ def get_db():
37
+ global _db
38
+ if _db is not None:
39
+ return _db
40
+
41
+ if not firebase_admin._apps:
42
+ init_firebase()
43
+ if not firebase_admin._apps:
44
+ raise RuntimeError(
45
+ "Firebase is not initialised. "
46
+ "Place serviceAccountKey.json in the backend/ folder and restart."
47
+ )
48
+
49
+ _db = firestore.client()
50
+ print("[Firebase] Firestore client ready.")
51
+ return _db
52
+
53
+
54
+ def verify_token(token: str) -> dict:
55
+ if not firebase_admin._apps:
56
+ init_firebase()
57
+ if not firebase_admin._apps:
58
+ raise RuntimeError("Firebase is not initialised.")
59
+ return auth.verify_id_token(token)
app/core/llm.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Centralised LLM client.
3
+ Supports: groq | openai | anthropic
4
+ All services should call `chat_completion()` instead of importing SDK clients directly.
5
+ """
6
+ import json
7
+ from typing import Any
8
+ from .config import get_settings
9
+
10
+ # Groq model to use β€” fast and capable for structured extraction tasks
11
+ GROQ_MODEL = "llama-3.3-70b-versatile"
12
+ OPENAI_MODEL = "gpt-4o-mini"
13
+ CLAUDE_MODEL = "claude-3-haiku-20240307"
14
+
15
+
16
+ async def chat_completion(
17
+ system: str,
18
+ user: str,
19
+ json_mode: bool = False,
20
+ temperature: float = 0,
21
+ ) -> str:
22
+ """
23
+ Single entry point for all LLM calls.
24
+ Returns the raw string content of the assistant reply.
25
+ """
26
+ settings = get_settings()
27
+ provider = settings.default_llm
28
+
29
+ if provider == "groq":
30
+ return await _groq(system, user, json_mode, temperature, settings)
31
+ if provider == "openai":
32
+ return await _openai(system, user, json_mode, temperature, settings)
33
+ if provider == "anthropic":
34
+ return await _anthropic(system, user, temperature, settings)
35
+
36
+ raise ValueError(f"Unknown LLM provider: {provider}")
37
+
38
+
39
+ async def _groq(system: str, user: str, json_mode: bool, temperature: float, settings) -> str:
40
+ from groq import AsyncGroq
41
+ client = AsyncGroq(api_key=settings.groq_api_key)
42
+
43
+ kwargs: dict[str, Any] = {
44
+ "model": GROQ_MODEL,
45
+ "messages": [{"role": "system", "content": system}, {"role": "user", "content": user}],
46
+ "temperature": temperature,
47
+ }
48
+ if json_mode:
49
+ kwargs["response_format"] = {"type": "json_object"}
50
+
51
+ resp = await client.chat.completions.create(**kwargs)
52
+ return resp.choices[0].message.content or ""
53
+
54
+
55
+ async def _openai(system: str, user: str, json_mode: bool, temperature: float, settings) -> str:
56
+ from openai import AsyncOpenAI
57
+ client = AsyncOpenAI(api_key=settings.openai_api_key)
58
+
59
+ kwargs: dict[str, Any] = {
60
+ "model": OPENAI_MODEL,
61
+ "messages": [{"role": "system", "content": system}, {"role": "user", "content": user}],
62
+ "temperature": temperature,
63
+ }
64
+ if json_mode:
65
+ kwargs["response_format"] = {"type": "json_object"}
66
+
67
+ resp = await client.chat.completions.create(**kwargs)
68
+ return resp.choices[0].message.content or ""
69
+
70
+
71
+ async def _anthropic(system: str, user: str, temperature: float, settings) -> str:
72
+ import anthropic
73
+ client = anthropic.AsyncAnthropic(api_key=settings.anthropic_api_key)
74
+ msg = await client.messages.create(
75
+ model=CLAUDE_MODEL,
76
+ max_tokens=1024,
77
+ system=system,
78
+ messages=[{"role": "user", "content": user}],
79
+ temperature=temperature,
80
+ )
81
+ return msg.content[0].text
82
+
83
+
84
+ async def chat_json(system: str, user: str) -> dict:
85
+ """Convenience wrapper β€” returns parsed JSON dict."""
86
+ raw = await chat_completion(system, user, json_mode=True, temperature=0)
87
+ return json.loads(raw)
app/core/neo4j_client.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Neo4j AuraDB client β€” singleton driver with helper methods.
3
+ """
4
+ from neo4j import GraphDatabase, Driver
5
+ from .config import get_settings
6
+
7
+ _driver: Driver | None = None
8
+
9
+
10
+ def get_driver() -> Driver:
11
+ global _driver
12
+ if _driver is not None:
13
+ return _driver
14
+
15
+ settings = get_settings()
16
+ if not settings.neo4j_uri or not settings.neo4j_password:
17
+ raise RuntimeError(
18
+ "Neo4j not configured. Set NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD in .env"
19
+ )
20
+
21
+ _driver = GraphDatabase.driver(
22
+ settings.neo4j_uri,
23
+ auth=(settings.neo4j_username, settings.neo4j_password),
24
+ )
25
+ _driver.verify_connectivity()
26
+ print("[Neo4j] Connected to AuraDB successfully.")
27
+ return _driver
28
+
29
+
30
+ def close_driver():
31
+ global _driver
32
+ if _driver:
33
+ _driver.close()
34
+ _driver = None
app/main.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ # Must be set before any grpc/firebase import to fix Windows DNS resolution
3
+ os.environ["GRPC_DNS_RESOLVER"] = "native"
4
+
5
+ from fastapi import FastAPI
6
+ from fastapi.middleware.cors import CORSMiddleware
7
+ from .core.firebase import init_firebase
8
+ from .api import research
9
+
10
+ app = FastAPI(
11
+ title="LitAgent API",
12
+ description="Autonomous Research Literature Agent backend",
13
+ version="1.0.0",
14
+ )
15
+
16
+ app.add_middleware(
17
+ CORSMiddleware,
18
+ allow_origins=["http://localhost:3000", "https://your-production-domain.com"],
19
+ allow_credentials=True,
20
+ allow_methods=["*"],
21
+ allow_headers=["*"],
22
+ )
23
+
24
+ app.include_router(research.router)
25
+
26
+ @app.on_event("startup")
27
+ async def startup():
28
+ init_firebase()
29
+
30
+ @app.get("/health")
31
+ async def health():
32
+ return {"status": "ok", "service": "litagent-api"}
app/schemas/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ ο»Ώ
app/schemas/research.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field
2
+ from typing import Optional, List
3
+ from enum import Enum
4
+
5
+
6
+ class SessionStatus(str, Enum):
7
+ accepted = "accepted"
8
+ retrieving = "retrieving"
9
+ ranking = "ranking"
10
+ expanding_citations = "expanding_citations"
11
+ parsing_pdfs = "parsing_pdfs"
12
+ extracting = "extracting"
13
+ building_graph = "building_graph"
14
+ generating_report = "generating_report"
15
+ council_review = "council_review"
16
+ completed = "completed"
17
+ failed = "failed"
18
+
19
+
20
+ class ResearchQueryRequest(BaseModel):
21
+ query: str = Field(..., min_length=10, max_length=1000)
22
+
23
+
24
+ class ResearchQueryResponse(BaseModel):
25
+ session_id: str
26
+ status: SessionStatus = SessionStatus.accepted
27
+
28
+
29
+ class SessionStatusResponse(BaseModel):
30
+ session_id: str
31
+ status: SessionStatus
32
+ step: Optional[str] = None
33
+ paper_count: Optional[int] = None
34
+
35
+
36
+ class PaperSource(str, Enum):
37
+ arxiv = "arxiv"
38
+ pubmed = "pubmed"
39
+ semantic_scholar = "semantic_scholar"
40
+ openalex = "openalex"
41
+ crossref = "crossref"
42
+
43
+
44
+ class PaperExtraction(BaseModel):
45
+ objective: str = ""
46
+ methodology: str = ""
47
+ datasets: List[str] = []
48
+ metrics: List[str] = []
49
+ key_findings: List[str] = []
50
+ limitations: List[str] = []
51
+ future_work: List[str] = []
52
+ summary: str = ""
53
+
54
+
55
+ class Paper(BaseModel):
56
+ id: str
57
+ session_id: str
58
+ external_source: PaperSource
59
+ source_paper_id: str
60
+ title: str
61
+ authors: List[str] = []
62
+ year: Optional[int] = None
63
+ abstract: str = ""
64
+ doi: Optional[str] = None
65
+ venue: Optional[str] = None
66
+ citation_count: Optional[int] = None
67
+ pdf_url: Optional[str] = None
68
+ relevance_score: Optional[float] = None
69
+ extraction: Optional[PaperExtraction] = None
70
+
71
+
72
+ class ClaimType(str, Enum):
73
+ performance = "performance"
74
+ methodological = "methodological"
75
+ theoretical = "theoretical"
76
+ empirical = "empirical"
77
+ limitation = "limitation"
78
+
79
+
80
+ class Claim(BaseModel):
81
+ id: str
82
+ paper_id: str
83
+ paper_title: str
84
+ claim_text: str
85
+ claim_type: ClaimType
86
+ evidence_span: str = ""
87
+ confidence: float = 0.0
88
+ entities: List[str] = []
89
+
90
+
91
+ class ContradictionSeverity(str, Enum):
92
+ direct = "direct"
93
+ partial = "partial"
94
+ contextual = "contextual"
95
+
96
+
97
+ class Contradiction(BaseModel):
98
+ id: str
99
+ session_id: str
100
+ claim_a: Claim
101
+ claim_b: Claim
102
+ severity: ContradictionSeverity
103
+ explanation: str
104
+ reason: Optional[str] = None
105
+
106
+
107
+ class ResearchGap(BaseModel):
108
+ id: str
109
+ session_id: str
110
+ title: str
111
+ description: str
112
+ category: str
113
+ recurrence_count: int = 0
114
+ importance_score: float = 0.0
115
+ source_paper_ids: List[str] = []
116
+ derivation_note: str = ""
117
+
118
+
119
+ class GraphNode(BaseModel):
120
+ id: str
121
+ label: str
122
+ type: str
123
+ data: dict = {}
124
+
125
+
126
+ class GraphEdge(BaseModel):
127
+ id: str
128
+ source: str
129
+ target: str
130
+ type: str
131
+ label: Optional[str] = None
132
+
133
+
134
+ class KnowledgeGraph(BaseModel):
135
+ nodes: List[GraphNode] = []
136
+ edges: List[GraphEdge] = []
137
+
138
+
139
+ class Report(BaseModel):
140
+ id: str
141
+ session_id: str
142
+ report_markdown: str
143
+ executive_summary: str = ""
144
+ trend_overview: str = ""
145
+ methodology_comparison: str = ""
146
+ contradiction_summary: str = ""
147
+ research_gaps: List[ResearchGap] = []
148
+ suggested_directions: List[str] = []
149
+ confidence_score: float = 0.0
150
+ created_at: str
app/services/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ ο»Ώ
app/services/council/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ ο»Ώ
app/services/council/council_runner.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LLM Council: five role-based agents validate and enrich the synthesis.
3
+ Roles: Extractor, Skeptic, Synthesizer, Contradiction Judge, Gap Prioritizer.
4
+ """
5
+ from typing import Any
6
+ from ...core.llm import chat_completion
7
+
8
+ ROLES = {
9
+ "extractor": (
10
+ "You are the Extractor. Confirm what the papers claim: their methods, "
11
+ "results, and limitations. Be factual and concise."
12
+ ),
13
+ "skeptic": (
14
+ "You are the Skeptic. Identify weaknesses, unsupported conclusions, "
15
+ "potential confounders, and overgeneralizations in the paper claims."
16
+ ),
17
+ "synthesizer": (
18
+ "You are the Synthesizer. Identify cross-paper patterns, shared trends, "
19
+ "and broader field-level insights from the provided summaries."
20
+ ),
21
+ "contradiction_judge": (
22
+ "You are the Contradiction Judge. Assess whether two claims genuinely "
23
+ "conflict or merely differ in scope/setup."
24
+ ),
25
+ "gap_prioritizer": (
26
+ "You are the Gap Prioritizer. Rank the identified research gaps by "
27
+ "importance, feasibility, and potential impact."
28
+ ),
29
+ }
30
+
31
+
32
+ async def run_role(role: str, content: str) -> str:
33
+ system = ROLES.get(role, "You are a helpful research assistant.")
34
+ try:
35
+ return await chat_completion(system, content[:2000], temperature=0.2)
36
+ except Exception as e:
37
+ return f"[Council role '{role}' unavailable: {e}]"
38
+
39
+
40
+ async def council_validate_report(report_markdown: str, papers_summary: str) -> dict[str, Any]:
41
+ extractor_out = await run_role("extractor", papers_summary)
42
+ skeptic_out = await run_role("skeptic", papers_summary)
43
+ synthesizer_out = await run_role("synthesizer", papers_summary)
44
+
45
+ return {
46
+ "extractor_notes": extractor_out,
47
+ "skeptic_notes": skeptic_out,
48
+ "synthesizer_notes": synthesizer_out,
49
+ }
app/services/extraction/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ ο»Ώ
app/services/extraction/contradiction_detector.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Detects contradictions between extracted claims using embeddings + LLM judgment.
3
+ """
4
+ import uuid
5
+ from typing import List
6
+ from sentence_transformers import SentenceTransformer, util
7
+ from ...schemas.research import Claim, Contradiction, ContradictionSeverity
8
+ from ...core.llm import chat_json
9
+
10
+ _model = None
11
+
12
+ JUDGE_PROMPT = """You are a scientific contradiction detector.
13
+ Given two claims from different papers, determine if they contradict each other.
14
+ Return JSON with keys:
15
+ - verdict: "contradicts" | "partially_contradicts" | "supports" | "inconclusive"
16
+ - explanation: string (1-2 sentences)
17
+ - reason: string (brief likely cause of conflict, if any)"""
18
+
19
+
20
+ def _get_model():
21
+ global _model
22
+ if _model is None:
23
+ _model = SentenceTransformer("all-MiniLM-L6-v2")
24
+ return _model
25
+
26
+
27
+ async def detect_contradictions(claims: List[Claim]) -> List[Contradiction]:
28
+ if len(claims) < 2:
29
+ return []
30
+
31
+ model = _get_model()
32
+ texts = [c.claim_text for c in claims]
33
+ embs = model.encode(texts, convert_to_tensor=True)
34
+ sims = util.cos_sim(embs, embs)
35
+
36
+ candidates: list[tuple[Claim, Claim]] = []
37
+ for i in range(len(claims)):
38
+ for j in range(i + 1, len(claims)):
39
+ if claims[i].paper_id == claims[j].paper_id:
40
+ continue
41
+ if float(sims[i][j]) > 0.55:
42
+ candidates.append((claims[i], claims[j]))
43
+
44
+ contradictions: List[Contradiction] = []
45
+
46
+ for ca, cb in candidates[:20]:
47
+ verdict_data = await _judge_pair(ca, cb)
48
+ if verdict_data.get("verdict") in ("contradicts", "partially_contradicts"):
49
+ severity = (
50
+ ContradictionSeverity.direct
51
+ if verdict_data["verdict"] == "contradicts"
52
+ else ContradictionSeverity.partial
53
+ )
54
+ contradictions.append(Contradiction(
55
+ id=str(uuid.uuid4()),
56
+ session_id="",
57
+ claim_a=ca,
58
+ claim_b=cb,
59
+ severity=severity,
60
+ explanation=verdict_data.get("explanation", ""),
61
+ reason=verdict_data.get("reason"),
62
+ ))
63
+
64
+ return contradictions
65
+
66
+
67
+ async def _judge_pair(ca: Claim, cb: Claim) -> dict:
68
+ text = (
69
+ f"Claim A (from: {ca.paper_title}):\n{ca.claim_text}\n\n"
70
+ f"Claim B (from: {cb.paper_title}):\n{cb.claim_text}"
71
+ )
72
+ try:
73
+ return await chat_json(JUDGE_PROMPT, text)
74
+ except Exception:
75
+ return {"verdict": "inconclusive", "explanation": "", "reason": ""}
app/services/extraction/extractor.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Structured information extraction from paper text using LLM + Pydantic validation.
3
+ """
4
+ from ...schemas.research import PaperExtraction
5
+ from ...core.llm import chat_json
6
+
7
+ SYSTEM_PROMPT = """You are a scientific paper analyst. Given the text of an academic paper,
8
+ extract structured information and return it as valid JSON with these exact keys:
9
+ - objective: string (the main research goal)
10
+ - methodology: string (the approach/method used)
11
+ - datasets: list of strings (dataset names used)
12
+ - metrics: list of strings (evaluation metrics)
13
+ - key_findings: list of strings (3-5 main results/conclusions)
14
+ - limitations: list of strings (stated or implied limitations)
15
+ - future_work: list of strings (suggested future directions)
16
+ - summary: string (2-3 sentence plain English summary)
17
+ Be concise and factual. Do not hallucinate."""
18
+
19
+
20
+ async def extract_paper(title: str, abstract: str, sections: dict[str, str]) -> PaperExtraction:
21
+ text_parts = [f"Title: {title}", f"Abstract: {abstract}"]
22
+ for key in ["introduction", "methodology", "methods", "results", "conclusion", "limitations"]:
23
+ for section_name, content in sections.items():
24
+ if key in section_name.lower() and content:
25
+ text_parts.append(f"\n## {section_name.title()}\n{content[:1500]}")
26
+ break
27
+
28
+ text = "\n".join(text_parts)[:4000]
29
+
30
+ try:
31
+ data = await chat_json(SYSTEM_PROMPT, text)
32
+ return PaperExtraction(**{k: data.get(k, v) for k, v in PaperExtraction().model_dump().items()})
33
+ except Exception:
34
+ return PaperExtraction(
35
+ objective=abstract[:200],
36
+ summary=abstract[:300],
37
+ )
app/services/graph/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ ο»Ώ
app/services/graph/graph_builder.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Builds a knowledge graph from papers using networkx, serializes to JSON for frontend.
3
+ """
4
+ import uuid
5
+ import networkx as nx
6
+ from typing import List
7
+ from ...schemas.research import Paper, KnowledgeGraph, GraphNode, GraphEdge
8
+
9
+
10
+ def build_graph(papers: List[Paper]) -> KnowledgeGraph:
11
+ G = nx.DiGraph()
12
+ nodes: List[GraphNode] = []
13
+ edges: List[GraphEdge] = []
14
+ seen_methods: dict[str, str] = {}
15
+ seen_datasets: dict[str, str] = {}
16
+ seen_topics: dict[str, str] = {}
17
+
18
+ for paper in papers:
19
+ # Paper node
20
+ G.add_node(paper.id, type="Paper", label=paper.title[:50])
21
+ nodes.append(GraphNode(id=paper.id, label=paper.title[:50], type="Paper"))
22
+
23
+ if not paper.extraction:
24
+ continue
25
+
26
+ # Method nodes
27
+ if paper.extraction.methodology:
28
+ method_key = paper.extraction.methodology[:40].lower()
29
+ if method_key not in seen_methods:
30
+ mid = str(uuid.uuid4())
31
+ seen_methods[method_key] = mid
32
+ G.add_node(mid, type="Method", label=paper.extraction.methodology[:40])
33
+ nodes.append(GraphNode(id=mid, label=paper.extraction.methodology[:40], type="Method"))
34
+ edges.append(GraphEdge(
35
+ id=str(uuid.uuid4()), source=paper.id,
36
+ target=seen_methods[method_key], type="USES",
37
+ ))
38
+
39
+ # Dataset nodes
40
+ for ds in (paper.extraction.datasets or [])[:3]:
41
+ dk = ds.lower()
42
+ if dk not in seen_datasets:
43
+ did = str(uuid.uuid4())
44
+ seen_datasets[dk] = did
45
+ G.add_node(did, type="Dataset", label=ds)
46
+ nodes.append(GraphNode(id=did, label=ds, type="Dataset"))
47
+ edges.append(GraphEdge(
48
+ id=str(uuid.uuid4()), source=paper.id,
49
+ target=seen_datasets[dk], type="TESTED_ON",
50
+ ))
51
+
52
+ # Author node (first author)
53
+ if paper.authors:
54
+ author = paper.authors[0]
55
+ aid = f"author_{author.replace(' ', '_').lower()}"
56
+ if not G.has_node(aid):
57
+ G.add_node(aid, type="Author", label=author)
58
+ nodes.append(GraphNode(id=aid, label=author, type="Author"))
59
+ edges.append(GraphEdge(
60
+ id=str(uuid.uuid4()), source=paper.id,
61
+ target=aid, type="WRITTEN_BY",
62
+ ))
63
+
64
+ return KnowledgeGraph(nodes=nodes, edges=edges)
app/services/graph/neo4j_writer.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Writes the knowledge graph to Neo4j AuraDB.
3
+
4
+ Node labels: Paper | Author | Method | Dataset | Claim | Topic
5
+ Relationships: CITES | USES | TESTED_ON | SUPPORTS | CONTRADICTS | WRITTEN_BY | RELATED_TO
6
+ """
7
+ from typing import List
8
+ from ...schemas.research import Paper, Contradiction
9
+ from ...core.neo4j_client import get_driver
10
+
11
+
12
+ # ── Schema constraints (run once) ────────────────────────────────────────────
13
+
14
+ CONSTRAINTS = [
15
+ "CREATE CONSTRAINT paper_id IF NOT EXISTS FOR (p:Paper) REQUIRE p.id IS UNIQUE",
16
+ "CREATE CONSTRAINT author_name IF NOT EXISTS FOR (a:Author) REQUIRE a.name IS UNIQUE",
17
+ "CREATE CONSTRAINT method_name IF NOT EXISTS FOR (m:Method) REQUIRE m.name IS UNIQUE",
18
+ "CREATE CONSTRAINT dataset_name IF NOT EXISTS FOR (d:Dataset) REQUIRE d.name IS UNIQUE",
19
+ ]
20
+
21
+
22
+ def ensure_constraints():
23
+ driver = get_driver()
24
+ with driver.session() as session:
25
+ for constraint in CONSTRAINTS:
26
+ try:
27
+ session.run(constraint)
28
+ except Exception:
29
+ pass # Already exists
30
+
31
+
32
+ # ── Write papers ─────────────────────────────────────────────────────────────
33
+
34
+ def write_papers(papers: List[Paper], session_id: str):
35
+ driver = get_driver()
36
+
37
+ with driver.session() as session:
38
+ for paper in papers:
39
+ # Create Paper node
40
+ session.run(
41
+ """
42
+ MERGE (p:Paper {id: $id})
43
+ SET p.title = $title,
44
+ p.year = $year,
45
+ p.abstract = $abstract,
46
+ p.venue = $venue,
47
+ p.citationCount= $citation_count,
48
+ p.sessionId = $session_id,
49
+ p.source = $source
50
+ """,
51
+ id = paper.id,
52
+ title = paper.title,
53
+ year = paper.year,
54
+ abstract = (paper.abstract or "")[:500],
55
+ venue = paper.venue or "",
56
+ citation_count= paper.citation_count or 0,
57
+ session_id = session_id,
58
+ source = paper.external_source,
59
+ )
60
+
61
+ # Author nodes + WRITTEN_BY edges
62
+ for author_name in (paper.authors or [])[:3]:
63
+ if not author_name:
64
+ continue
65
+ session.run(
66
+ """
67
+ MERGE (a:Author {name: $name})
68
+ WITH a
69
+ MATCH (p:Paper {id: $paper_id})
70
+ MERGE (p)-[:WRITTEN_BY]->(a)
71
+ """,
72
+ name=author_name, paper_id=paper.id,
73
+ )
74
+
75
+ if not paper.extraction:
76
+ continue
77
+
78
+ # Method node + USES edge
79
+ if paper.extraction.methodology:
80
+ method_name = paper.extraction.methodology[:80]
81
+ session.run(
82
+ """
83
+ MERGE (m:Method {name: $name})
84
+ WITH m
85
+ MATCH (p:Paper {id: $paper_id})
86
+ MERGE (p)-[:USES]->(m)
87
+ """,
88
+ name=method_name, paper_id=paper.id,
89
+ )
90
+
91
+ # Dataset nodes + TESTED_ON edges
92
+ for ds in (paper.extraction.datasets or [])[:3]:
93
+ if not ds:
94
+ continue
95
+ session.run(
96
+ """
97
+ MERGE (d:Dataset {name: $name})
98
+ WITH d
99
+ MATCH (p:Paper {id: $paper_id})
100
+ MERGE (p)-[:TESTED_ON]->(d)
101
+ """,
102
+ name=ds, paper_id=paper.id,
103
+ )
104
+
105
+ print(f"[Neo4j] Wrote {len(papers)} papers to AuraDB.")
106
+
107
+
108
+ # ── Write contradictions ─────────────────────────────────────────────────────
109
+
110
+ def write_contradictions(contradictions: List[Contradiction]):
111
+ if not contradictions:
112
+ return
113
+ driver = get_driver()
114
+
115
+ with driver.session() as session:
116
+ for c in contradictions:
117
+ session.run(
118
+ """
119
+ MATCH (a:Paper {id: $paper_a})
120
+ MATCH (b:Paper {id: $paper_b})
121
+ MERGE (a)-[r:CONTRADICTS {id: $cid}]->(b)
122
+ SET r.severity = $severity,
123
+ r.explanation = $explanation
124
+ """,
125
+ paper_a = c.claim_a.paper_id,
126
+ paper_b = c.claim_b.paper_id,
127
+ cid = c.id,
128
+ severity = c.severity,
129
+ explanation= c.explanation[:300],
130
+ )
131
+
132
+ print(f"[Neo4j] Wrote {len(contradictions)} contradiction edges.")
133
+
134
+
135
+ # ── Query helpers for the API ─────────────────────────────────────────────────
136
+
137
+ def get_graph_for_session(session_id: str) -> dict:
138
+ """Return nodes + edges for a session as a dict compatible with the frontend."""
139
+ driver = get_driver()
140
+
141
+ with driver.session() as session:
142
+ # Nodes
143
+ node_result = session.run(
144
+ """
145
+ MATCH (p:Paper {sessionId: $sid})
146
+ OPTIONAL MATCH (p)-[:WRITTEN_BY]->(a:Author)
147
+ OPTIONAL MATCH (p)-[:USES]->(m:Method)
148
+ OPTIONAL MATCH (p)-[:TESTED_ON]->(d:Dataset)
149
+ RETURN p, collect(DISTINCT a) AS authors,
150
+ collect(DISTINCT m) AS methods,
151
+ collect(DISTINCT d) AS datasets
152
+ """,
153
+ sid=session_id,
154
+ )
155
+
156
+ nodes = []
157
+ edges = []
158
+ seen_node_ids: set[str] = set()
159
+
160
+ for record in node_result:
161
+ p = record["p"]
162
+ if p["id"] not in seen_node_ids:
163
+ seen_node_ids.add(p["id"])
164
+ nodes.append({"id": p["id"], "label": (p.get("title") or "")[:50], "type": "Paper"})
165
+
166
+ for author in record["authors"]:
167
+ if author is None:
168
+ continue
169
+ aid = f"author_{author['name']}"
170
+ if aid not in seen_node_ids:
171
+ seen_node_ids.add(aid)
172
+ nodes.append({"id": aid, "label": author["name"], "type": "Author"})
173
+ edges.append({"id": f"e_{p['id']}_{aid}", "source": p["id"], "target": aid, "type": "WRITTEN_BY"})
174
+
175
+ for method in record["methods"]:
176
+ if method is None:
177
+ continue
178
+ mid = f"method_{method['name'][:30]}"
179
+ if mid not in seen_node_ids:
180
+ seen_node_ids.add(mid)
181
+ nodes.append({"id": mid, "label": method["name"][:40], "type": "Method"})
182
+ edges.append({"id": f"e_{p['id']}_{mid}", "source": p["id"], "target": mid, "type": "USES"})
183
+
184
+ for dataset in record["datasets"]:
185
+ if dataset is None:
186
+ continue
187
+ did = f"dataset_{dataset['name'][:30]}"
188
+ if did not in seen_node_ids:
189
+ seen_node_ids.add(did)
190
+ nodes.append({"id": did, "label": dataset["name"], "type": "Dataset"})
191
+ edges.append({"id": f"e_{p['id']}_{did}", "source": p["id"], "target": did, "type": "TESTED_ON"})
192
+
193
+ # Contradiction edges
194
+ contra_result = session.run(
195
+ """
196
+ MATCH (a:Paper {sessionId: $sid})-[r:CONTRADICTS]->(b:Paper {sessionId: $sid})
197
+ RETURN a.id AS src, b.id AS tgt, r.id AS rid, r.severity AS severity
198
+ """,
199
+ sid=session_id,
200
+ )
201
+ for record in contra_result:
202
+ edges.append({
203
+ "id": record["rid"],
204
+ "source": record["src"],
205
+ "target": record["tgt"],
206
+ "type": "CONTRADICTS",
207
+ "label": record["severity"],
208
+ })
209
+
210
+ return {"nodes": nodes, "edges": edges}
app/services/parsing/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ ο»Ώ
app/services/parsing/pdf_parser.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Downloads and parses PDFs.
3
+ Primary: PyMuPDF (fitz) | Fallback: pdfplumber
4
+ """
5
+ import fitz # PyMuPDF
6
+ import pdfplumber
7
+ import httpx
8
+ import tempfile
9
+ import os
10
+ from typing import Optional
11
+
12
+
13
+ SECTION_KEYWORDS = ["abstract", "introduction", "related work", "background",
14
+ "methodology", "methods", "experiments", "results",
15
+ "discussion", "conclusion", "references", "limitations"]
16
+
17
+
18
+ def download_pdf(url: str) -> Optional[bytes]:
19
+ try:
20
+ with httpx.Client(timeout=30, follow_redirects=True) as client:
21
+ resp = client.get(url, headers={"User-Agent": "LitAgent/1.0"})
22
+ if resp.status_code == 200 and "pdf" in resp.headers.get("content-type", ""):
23
+ return resp.content
24
+ except Exception:
25
+ pass
26
+ return None
27
+
28
+
29
+ def extract_text_pymupdf(pdf_bytes: bytes) -> dict[str, str]:
30
+ """Returns dict of section_name -> text."""
31
+ sections: dict[str, str] = {}
32
+ current_section = "body"
33
+ buffer: list[str] = []
34
+
35
+ with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
36
+ full_text = ""
37
+ for page in doc:
38
+ full_text += page.get_text("text") + "\n"
39
+
40
+ for line in full_text.split("\n"):
41
+ lowered = line.strip().lower()
42
+ if any(lowered.startswith(kw) for kw in SECTION_KEYWORDS):
43
+ if buffer:
44
+ sections[current_section] = "\n".join(buffer).strip()
45
+ buffer = []
46
+ current_section = lowered[:40]
47
+ else:
48
+ buffer.append(line)
49
+
50
+ if buffer:
51
+ sections[current_section] = "\n".join(buffer).strip()
52
+
53
+ return sections
54
+
55
+
56
+ def extract_text_pdfplumber(pdf_bytes: bytes) -> dict[str, str]:
57
+ """Fallback parser."""
58
+ text = ""
59
+ with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
60
+ f.write(pdf_bytes)
61
+ tmp_path = f.name
62
+ try:
63
+ with pdfplumber.open(tmp_path) as pdf:
64
+ for page in pdf.pages:
65
+ text += (page.extract_text() or "") + "\n"
66
+ finally:
67
+ os.unlink(tmp_path)
68
+ return {"body": text}
69
+
70
+
71
+ def parse_pdf(url: str) -> Optional[dict[str, str]]:
72
+ """Main entry point β€” returns section dict or None if unavailable."""
73
+ pdf_bytes = download_pdf(url)
74
+ if not pdf_bytes:
75
+ return None
76
+ try:
77
+ sections = extract_text_pymupdf(pdf_bytes)
78
+ if len(sections) >= 2:
79
+ return sections
80
+ return extract_text_pdfplumber(pdf_bytes)
81
+ except Exception:
82
+ try:
83
+ return extract_text_pdfplumber(pdf_bytes)
84
+ except Exception:
85
+ return None
app/services/reporting/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ ο»Ώ
app/services/reporting/report_generator.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Generates the final literature review report using LLM synthesis.
3
+ """
4
+ from typing import List
5
+ from ...schemas.research import Paper, Report, ResearchGap
6
+ from ...core.llm import chat_completion, chat_json
7
+ from datetime import datetime, timezone
8
+ import uuid
9
+
10
+ SYNTHESIS_PROMPT = """You are an expert academic research synthesizer. Given a list of paper summaries,
11
+ generate a comprehensive literature review report in Markdown format with these sections:
12
+ 1. Executive Summary (3-5 sentences)
13
+ 2. Trends in Literature
14
+ 3. Methodology Comparison
15
+ 4. Major Findings
16
+ 5. Identified Contradictions
17
+ 6. Research Gaps
18
+ 7. Suggested Future Directions
19
+ 8. References
20
+
21
+ Be analytical, cite papers by [Author, Year], and provide a confidence score (0-1) at the end as:
22
+ CONFIDENCE: 0.XX"""
23
+
24
+ GAP_PROMPT = """Given these paper limitations and contradictions, identify 3-5 distinct research gaps.
25
+ Return a JSON object with key "gaps" containing an array of objects:
26
+ {title, description, category, recurrence_count, importance_score, derivation_note}
27
+ Categories: missing_population, missing_replication, inconsistent_benchmarks, data_scarcity, interpretability, fairness, other"""
28
+
29
+
30
+ async def generate_report(query: str, papers: List[Paper], session_id: str) -> Report:
31
+ paper_texts = []
32
+ for p in papers[:20]:
33
+ summary = f"[{', '.join(p.authors[:2])}, {p.year}] {p.title}"
34
+ if p.extraction:
35
+ summary += f"\n Objective: {p.extraction.objective}"
36
+ summary += f"\n Method: {p.extraction.methodology[:100]}"
37
+ if p.extraction.key_findings:
38
+ summary += f"\n Findings: {'; '.join(p.extraction.key_findings[:2])}"
39
+ if p.extraction.limitations:
40
+ summary += f"\n Limitations: {'; '.join(p.extraction.limitations[:2])}"
41
+ paper_texts.append(summary)
42
+
43
+ context = f"Research Query: {query}\n\nPapers:\n" + "\n\n".join(paper_texts)
44
+
45
+ report_markdown = ""
46
+ confidence_score = 0.7
47
+ gaps: List[ResearchGap] = []
48
+
49
+ try:
50
+ report_markdown = await chat_completion(SYNTHESIS_PROMPT, context, temperature=0.3)
51
+
52
+ if "CONFIDENCE:" in report_markdown:
53
+ try:
54
+ line = [l for l in report_markdown.split("\n") if "CONFIDENCE:" in l][0]
55
+ confidence_score = float(line.split(":")[1].strip())
56
+ except Exception:
57
+ pass
58
+
59
+ lim_context = "\n".join(
60
+ f"- [{p.title[:40]}]: {'; '.join((p.extraction.limitations or [])[:2])}"
61
+ for p in papers if p.extraction and p.extraction.limitations
62
+ )[:2000]
63
+
64
+ if lim_context:
65
+ gap_data = await chat_json(GAP_PROMPT, lim_context)
66
+ raw_gaps = gap_data.get("gaps", gap_data) if isinstance(gap_data, dict) else gap_data
67
+ if isinstance(raw_gaps, list):
68
+ gaps = [
69
+ ResearchGap(id=str(uuid.uuid4()), session_id=session_id, **g)
70
+ for g in raw_gaps
71
+ ]
72
+
73
+ except Exception as e:
74
+ report_markdown = (
75
+ f"# Literature Review: {query}\n\n"
76
+ f"*Report generation encountered an error: {e}*\n\n"
77
+ + "\n\n".join(f"- **{p.title}** ({p.year})" for p in papers[:10])
78
+ )
79
+
80
+ return Report(
81
+ id=str(uuid.uuid4()),
82
+ session_id=session_id,
83
+ report_markdown=report_markdown,
84
+ research_gaps=gaps,
85
+ confidence_score=confidence_score,
86
+ created_at=datetime.now(timezone.utc).isoformat(),
87
+ )
app/services/retrieval/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ ο»Ώ
app/services/retrieval/arxiv_adapter.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """arXiv paper retrieval adapter."""
2
+ import arxiv
3
+ from typing import List
4
+ from ...schemas.research import Paper, PaperSource
5
+ import uuid
6
+
7
+
8
+ def search(query: str, session_id: str, max_results: int = 15) -> List[Paper]:
9
+ client = arxiv.Client()
10
+ search_obj = arxiv.Search(
11
+ query=query,
12
+ max_results=max_results,
13
+ sort_by=arxiv.SortCriterion.Relevance,
14
+ )
15
+ papers = []
16
+ for r in client.results(search_obj):
17
+ papers.append(Paper(
18
+ id=str(uuid.uuid4()),
19
+ session_id=session_id,
20
+ external_source=PaperSource.arxiv,
21
+ source_paper_id=r.entry_id,
22
+ title=r.title,
23
+ authors=[a.name for a in r.authors],
24
+ year=r.published.year if r.published else None,
25
+ abstract=r.summary,
26
+ doi=r.doi,
27
+ venue="arXiv",
28
+ pdf_url=r.pdf_url,
29
+ ))
30
+ return papers
app/services/retrieval/query_decomposer.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Decomposes a raw research query into a structured search plan using LLM.
3
+ """
4
+ from pydantic import BaseModel
5
+ from typing import List
6
+ from ...core.llm import chat_json
7
+
8
+
9
+ class QueryPlan(BaseModel):
10
+ main_topic: str
11
+ subtopics: List[str]
12
+ filters: dict
13
+ query_variants: List[str]
14
+
15
+
16
+ SYSTEM_PROMPT = """You are a research librarian. Given a user's research question,
17
+ decompose it into a structured search plan. Return valid JSON only with these keys:
18
+ - main_topic: string
19
+ - subtopics: list of strings
20
+ - filters: {"year_from": int or null}
21
+ - query_variants: list of 3-5 search strings for different academic databases"""
22
+
23
+
24
+ async def decompose_query(query: str) -> QueryPlan:
25
+ try:
26
+ data = await chat_json(SYSTEM_PROMPT, query)
27
+ return QueryPlan(**data)
28
+ except Exception:
29
+ return QueryPlan(
30
+ main_topic=query[:80],
31
+ subtopics=[],
32
+ filters={},
33
+ query_variants=[query],
34
+ )
app/services/retrieval/ranker.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Scores and ranks papers by relevance to the original query.
3
+ Uses: semantic similarity (sentence-transformers) + citation count + recency.
4
+ """
5
+ from typing import List
6
+ from ...schemas.research import Paper
7
+ from sentence_transformers import SentenceTransformer, util
8
+ import torch
9
+
10
+ _model: SentenceTransformer | None = None
11
+
12
+
13
+ def _get_model() -> SentenceTransformer:
14
+ global _model
15
+ if _model is None:
16
+ _model = SentenceTransformer("all-MiniLM-L6-v2")
17
+ return _model
18
+
19
+
20
+ def rank_papers(query: str, papers: List[Paper], top_k: int = 20) -> List[Paper]:
21
+ if not papers:
22
+ return []
23
+
24
+ model = _get_model()
25
+ texts = [f"{p.title}. {p.abstract[:300]}" for p in papers]
26
+ q_emb = model.encode(query, convert_to_tensor=True)
27
+ p_embs = model.encode(texts, convert_to_tensor=True)
28
+
29
+ sem_scores = util.cos_sim(q_emb, p_embs)[0].tolist()
30
+
31
+ # Normalise citation counts
32
+ max_citations = max((p.citation_count or 0 for p in papers), default=1) or 1
33
+ current_year = 2025
34
+
35
+ scored = []
36
+ for i, paper in enumerate(papers):
37
+ sem = sem_scores[i]
38
+ cite = (paper.citation_count or 0) / max_citations
39
+ rec = max(0, 1 - (current_year - (paper.year or current_year)) / 10)
40
+ score = 0.6 * sem + 0.25 * cite + 0.15 * rec
41
+ paper.relevance_score = round(float(score), 4)
42
+ scored.append(paper)
43
+
44
+ scored.sort(key=lambda p: p.relevance_score or 0, reverse=True)
45
+ return scored[:top_k]
app/services/retrieval/semantic_scholar_adapter.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Semantic Scholar paper retrieval adapter."""
2
+ import httpx
3
+ from typing import List
4
+ from ...schemas.research import Paper, PaperSource
5
+ from ...core.config import get_settings
6
+ import uuid
7
+ import time
8
+
9
+ BASE = "https://api.semanticscholar.org/graph/v1"
10
+ FIELDS = "paperId,title,authors,year,abstract,citationCount,venue,externalIds,openAccessPdf"
11
+
12
+
13
+ def _headers() -> dict:
14
+ key = get_settings().semantic_scholar_api_key
15
+ return {"x-api-key": key} if key else {}
16
+
17
+
18
+ def search(query: str, session_id: str, limit: int = 10) -> List[Paper]:
19
+ try:
20
+ with httpx.Client(timeout=15) as client:
21
+ resp = client.get(
22
+ f"{BASE}/paper/search",
23
+ params={"query": query, "limit": limit, "fields": FIELDS},
24
+ headers=_headers(),
25
+ )
26
+ if resp.status_code == 429:
27
+ print("[SemanticScholar] Rate limited β€” skipping, using arXiv only.")
28
+ return []
29
+ if resp.status_code != 200:
30
+ print(f"[SemanticScholar] search returned {resp.status_code} β€” skipping.")
31
+ return []
32
+ data = resp.json().get("data", []) or []
33
+ except Exception as e:
34
+ print(f"[SemanticScholar] search error: {e}")
35
+ return []
36
+
37
+ papers = []
38
+ for item in data:
39
+ try:
40
+ papers.append(Paper(
41
+ id=str(uuid.uuid4()),
42
+ session_id=session_id,
43
+ external_source=PaperSource.semantic_scholar,
44
+ source_paper_id=item.get("paperId", ""),
45
+ title=item.get("title") or "",
46
+ authors=[a.get("name", "") for a in (item.get("authors") or [])],
47
+ year=item.get("year"),
48
+ abstract=item.get("abstract") or "",
49
+ doi=item.get("externalIds", {}).get("DOI"),
50
+ venue=item.get("venue"),
51
+ citation_count=item.get("citationCount"),
52
+ pdf_url=(item.get("openAccessPdf") or {}).get("url"),
53
+ ))
54
+ except Exception:
55
+ continue
56
+ return papers
57
+
58
+
59
+ def get_references(paper_id: str, session_id: str, limit: int = 10) -> List[Paper]:
60
+ if not paper_id:
61
+ return []
62
+ try:
63
+ time.sleep(1) # Be polite to avoid 429
64
+ with httpx.Client(timeout=15) as client:
65
+ resp = client.get(
66
+ f"{BASE}/paper/{paper_id}/references",
67
+ params={"limit": limit, "fields": FIELDS},
68
+ headers=_headers(),
69
+ )
70
+ if resp.status_code != 200:
71
+ return []
72
+ data = resp.json().get("data", []) or []
73
+ except Exception as e:
74
+ print(f"[SemanticScholar] get_references error: {e}")
75
+ return []
76
+
77
+ results = []
78
+ for item in (data or []):
79
+ cited = item.get("citedPaper") or {}
80
+ if not cited.get("title"):
81
+ continue
82
+ try:
83
+ results.append(Paper(
84
+ id=str(uuid.uuid4()),
85
+ session_id=session_id,
86
+ external_source=PaperSource.semantic_scholar,
87
+ source_paper_id=cited.get("paperId", ""),
88
+ title=cited.get("title", ""),
89
+ authors=[a.get("name", "") for a in (cited.get("authors") or [])],
90
+ year=cited.get("year"),
91
+ abstract=cited.get("abstract") or "",
92
+ ))
93
+ except Exception:
94
+ continue
95
+ return results
app/workers/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ ο»Ώ
app/workers/pipeline.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Research pipeline using FastAPI BackgroundTasks (no Redis/Celery required).
3
+ """
4
+ import asyncio
5
+ import uuid
6
+ from datetime import datetime, timezone
7
+
8
+
9
+ def _update_status(db, session_id: str, status: str, extra: dict = {}):
10
+ try:
11
+ db.collection("research_sessions").document(session_id).update(
12
+ {"status": status, **extra}
13
+ )
14
+ print(f"[Pipeline] {session_id[:8]}... β†’ {status}")
15
+ except Exception as e:
16
+ print(f"[Pipeline] Failed to update status: {e}")
17
+
18
+
19
+ async def run_research_pipeline(session_id: str, query: str, user_id: str):
20
+ """Full async pipeline β€” called as a FastAPI background task."""
21
+ from ..core.firebase import get_db, init_firebase
22
+ from ..services.retrieval.query_decomposer import decompose_query
23
+ from ..services.retrieval import arxiv_adapter, semantic_scholar_adapter
24
+ from ..services.retrieval.ranker import rank_papers
25
+ from ..services.parsing.pdf_parser import parse_pdf
26
+ from ..services.extraction.extractor import extract_paper
27
+ from ..services.extraction.contradiction_detector import detect_contradictions
28
+ from ..services.graph.graph_builder import build_graph
29
+ from ..services.reporting.report_generator import generate_report
30
+ from ..services.council.council_runner import council_validate_report
31
+ from ..schemas.research import Claim, ClaimType
32
+ from ..core.config import get_settings
33
+
34
+ init_firebase()
35
+ db = get_db()
36
+ settings = get_settings()
37
+
38
+ try:
39
+ # ── Stage 1: Retrieve papers ──────────────────────────────────────────
40
+ _update_status(db, session_id, "retrieving")
41
+ query_plan = await decompose_query(query)
42
+ all_papers = []
43
+
44
+ for variant in (query_plan.query_variants or [query])[:2]:
45
+ # arXiv β€” always reliable
46
+ try:
47
+ all_papers += arxiv_adapter.search(variant, session_id, max_results=10)
48
+ print(f"[arXiv] fetched {len(all_papers)} papers for: {variant[:50]}")
49
+ except Exception as e:
50
+ print(f"[arXiv] error: {e}")
51
+
52
+ # Semantic Scholar β€” skip gracefully if rate-limited
53
+ try:
54
+ ss = semantic_scholar_adapter.search(variant, session_id, limit=8)
55
+ all_papers += ss
56
+ print(f"[S2] fetched {len(ss)} papers")
57
+ except Exception as e:
58
+ print(f"[S2] error: {e}")
59
+
60
+ # Deduplicate by title
61
+ seen: set[str] = set()
62
+ unique_papers = []
63
+ for p in all_papers:
64
+ key = (p.title or "").lower()[:60]
65
+ if key and key not in seen:
66
+ seen.add(key)
67
+ unique_papers.append(p)
68
+
69
+ print(f"[Pipeline] {len(unique_papers)} unique papers after dedup")
70
+
71
+ if not unique_papers:
72
+ _update_status(db, session_id, "failed", {"error": "No papers found. Try a different query."})
73
+ return
74
+
75
+ # ── Stage 2: Rank ─────────────────────────────────────────────────────
76
+ _update_status(db, session_id, "ranking")
77
+ try:
78
+ ranked = rank_papers(query, unique_papers, top_k=20)
79
+ except Exception as e:
80
+ print(f"[Ranker] error: {e} β€” using unranked")
81
+ ranked = unique_papers[:20]
82
+
83
+ # ── Stage 3: Citation expansion ───────────────────────────────────────
84
+ _update_status(db, session_id, "expanding_citations")
85
+ for paper in ranked[:3]:
86
+ if paper.external_source == "semantic_scholar" and paper.source_paper_id:
87
+ try:
88
+ extras = semantic_scholar_adapter.get_references(
89
+ paper.source_paper_id, session_id, limit=5
90
+ ) or []
91
+ for ep in extras:
92
+ key = (ep.title or "").lower()[:60]
93
+ if key and key not in seen:
94
+ seen.add(key)
95
+ ranked.append(ep)
96
+ except Exception as e:
97
+ print(f"[Citations] error: {e}")
98
+
99
+ final_papers = ranked[:settings.max_papers_per_session]
100
+ _update_status(db, session_id, "extracting", {"paperCount": len(final_papers)})
101
+
102
+ # Save basic paper metadata immediately so UI can show them
103
+ for paper in final_papers:
104
+ try:
105
+ db.collection("papers").document(paper.id).set({
106
+ **paper.model_dump(),
107
+ "sessionId": session_id,
108
+ })
109
+ except Exception as e:
110
+ print(f"[Firestore] paper save error: {e}")
111
+
112
+ # ── Stage 4: PDF Parsing ────────────────────────────���─────────────────
113
+ _update_status(db, session_id, "parsing_pdfs")
114
+ paper_sections: dict[str, dict] = {}
115
+ for paper in final_papers:
116
+ if paper.pdf_url:
117
+ try:
118
+ sections = parse_pdf(paper.pdf_url)
119
+ paper_sections[paper.id] = sections or {}
120
+ except Exception as e:
121
+ print(f"[PDF] parse error for {paper.title[:40]}: {e}")
122
+ paper_sections[paper.id] = {}
123
+
124
+ # ── Stage 5: LLM Extraction ───────────────────────────────────────────
125
+ _update_status(db, session_id, "extracting")
126
+ for paper in final_papers:
127
+ try:
128
+ sections = paper_sections.get(paper.id, {})
129
+ paper.extraction = await extract_paper(paper.title, paper.abstract, sections)
130
+ db.collection("papers").document(paper.id).update(
131
+ {"extraction": paper.extraction.model_dump()}
132
+ )
133
+ except Exception as e:
134
+ print(f"[Extract] error for {paper.title[:40]}: {e}")
135
+
136
+ # ── Stage 6: Graph + Contradictions ───────────────────────────────────
137
+ _update_status(db, session_id, "building_graph")
138
+ claims: list[Claim] = []
139
+ for paper in final_papers:
140
+ if paper.extraction and paper.extraction.key_findings:
141
+ for finding in paper.extraction.key_findings[:2]:
142
+ claims.append(Claim(
143
+ id=str(uuid.uuid4()),
144
+ paper_id=paper.id,
145
+ paper_title=paper.title,
146
+ claim_text=finding,
147
+ claim_type=ClaimType.empirical,
148
+ evidence_span=finding,
149
+ confidence=0.75,
150
+ ))
151
+
152
+ try:
153
+ contradictions = await detect_contradictions(claims)
154
+ for c in contradictions:
155
+ c.session_id = session_id
156
+ db.collection("contradictions").document(c.id).set(c.model_dump())
157
+ except Exception as e:
158
+ print(f"[Contradictions] error: {e}")
159
+ contradictions = []
160
+
161
+ try:
162
+ graph = build_graph(final_papers)
163
+ db.collection("graphs").document(session_id).set(graph.model_dump())
164
+
165
+ # ── Persist to Neo4j AuraDB ───────────────────────────────────────
166
+ try:
167
+ from ..core.neo4j_client import get_driver
168
+ from ..services.graph.neo4j_writer import (
169
+ ensure_constraints, write_papers, write_contradictions,
170
+ )
171
+ ensure_constraints()
172
+ write_papers(final_papers, session_id)
173
+ write_contradictions(contradictions)
174
+ print(f"[Neo4j] Knowledge graph persisted for session {session_id}.")
175
+ except Exception as neo4j_err:
176
+ print(f"[Neo4j] Non-fatal β€” could not write to AuraDB: {neo4j_err}")
177
+ except Exception as e:
178
+ print(f"[Graph] error: {e}")
179
+
180
+ # ── Stage 7: Report ───────────────────────────────────────────────────
181
+ _update_status(db, session_id, "generating_report")
182
+ try:
183
+ report = await generate_report(query, final_papers, session_id)
184
+ except Exception as e:
185
+ print(f"[Report] error: {e}")
186
+ from ..schemas.research import Report
187
+ report = Report(
188
+ id=str(uuid.uuid4()),
189
+ session_id=session_id,
190
+ report_markdown=f"# Literature Review: {query}\n\nReport generation failed: {e}",
191
+ confidence_score=0.0,
192
+ created_at=datetime.now(timezone.utc).isoformat(),
193
+ )
194
+
195
+ # ── Stage 8: LLM Council ──────────────────────────────────────────────
196
+ _update_status(db, session_id, "council_review")
197
+ try:
198
+ papers_summary = "\n".join(
199
+ f"- {p.title}: {p.extraction.summary if p.extraction else ''}"
200
+ for p in final_papers[:8]
201
+ )
202
+ council_notes = await council_validate_report(report.report_markdown, papers_summary)
203
+ report.report_markdown += (
204
+ "\n\n---\n## LLM Council Notes\n"
205
+ f"**Extractor:** {council_notes['extractor_notes'][:300]}\n\n"
206
+ f"**Skeptic:** {council_notes['skeptic_notes'][:300]}\n\n"
207
+ f"**Synthesizer:** {council_notes['synthesizer_notes'][:300]}"
208
+ )
209
+ except Exception as e:
210
+ print(f"[Council] error: {e}")
211
+
212
+ db.collection("reports").document(report.id).set(report.model_dump())
213
+
214
+ # ── Done ──────────────────────────────────────────────────────────────
215
+ _update_status(db, session_id, "completed", {
216
+ "completedAt": datetime.now(timezone.utc).isoformat(),
217
+ "paperCount": len(final_papers),
218
+ })
219
+ print(f"[Pipeline] COMPLETE β€” {len(final_papers)} papers, session {session_id[:8]}")
220
+
221
+ except Exception as exc:
222
+ print(f"[Pipeline] FATAL: {exc}")
223
+ try:
224
+ _update_status(db, session_id, "failed", {"error": str(exc)})
225
+ except Exception:
226
+ pass
227
+ raise
requirements.txt ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Web framework
2
+ fastapi>=0.115.0
3
+ uvicorn[standard]>=0.32.0
4
+ python-multipart>=0.0.12
5
+
6
+ # Firebase Admin (replaces PostgreSQL)
7
+ firebase-admin>=6.6.0
8
+
9
+ # Background tasks (FastAPI BackgroundTasks β€” no Redis required)
10
+
11
+ # LLM adapters
12
+ groq>=0.13.0
13
+ openai>=1.58.0
14
+ anthropic>=0.40.0
15
+ google-generativeai>=0.8.3
16
+
17
+ # PDF parsing
18
+ PyMuPDF>=1.24.14
19
+ pdfplumber>=0.11.4
20
+
21
+ # NLP / Embeddings
22
+ sentence-transformers>=3.3.1
23
+ scikit-learn>=1.5.2
24
+ spacy>=3.8.3
25
+
26
+ # Graph
27
+ networkx>=3.4.2
28
+
29
+ # Academic APIs
30
+ arxiv>=2.1.3
31
+ requests>=2.32.3
32
+ httpx>=0.28.1
33
+
34
+ # Validation
35
+ pydantic>=2.10.3
36
+ pydantic-settings>=2.6.1
37
+
38
+ # Utilities
39
+ python-dotenv>=1.0.1
40
+ tenacity>=9.0.0