Spaces:

ketannnn
/

litagent

Sleeping

Vansh180 committed on Mar 13

Commit

7dd9eed

0 Parent(s):

feat: full-stack LitAgent MVP with Neo4j knowledge graph integration

- FastAPI backend: pipeline (arXiv + Semantic Scholar), extraction, contradiction detection, report generation, LLM Council
- Firebase Auth + Firestore for sessions, papers, reports
- Neo4j AuraDB: writes Paper/Author/Method/Dataset nodes and CITES/USES/CONTRADICTS edges per session
- Next.js frontend: dashboard, session page with stepper, tabs (papers, report, graph, contradictions, gaps)
- ReactFlow knowledge graph viewer with stable nodeTypes and memoized nodes/edges
- Graceful 404 handling on report/graph tabs with auto-fetch on session completion
- Groq (llama-3.3-70b-versatile) as default LLM provider

Made-with: Cursor

Files changed (33) hide show

app/__init__.py +1 -0
app/api/__init__.py +1 -0
app/api/research.py +100 -0
app/core/__init__.py +1 -0
app/core/auth.py +45 -0
app/core/config.py +42 -0
app/core/firebase.py +59 -0
app/core/llm.py +87 -0
app/core/neo4j_client.py +34 -0
app/main.py +32 -0
app/schemas/__init__.py +1 -0
app/schemas/research.py +150 -0
app/services/__init__.py +1 -0
app/services/council/__init__.py +1 -0
app/services/council/council_runner.py +49 -0
app/services/extraction/__init__.py +1 -0
app/services/extraction/contradiction_detector.py +75 -0
app/services/extraction/extractor.py +37 -0
app/services/graph/__init__.py +1 -0
app/services/graph/graph_builder.py +64 -0
app/services/graph/neo4j_writer.py +210 -0
app/services/parsing/__init__.py +1 -0
app/services/parsing/pdf_parser.py +85 -0
app/services/reporting/__init__.py +1 -0
app/services/reporting/report_generator.py +87 -0
app/services/retrieval/__init__.py +1 -0
app/services/retrieval/arxiv_adapter.py +30 -0
app/services/retrieval/query_decomposer.py +34 -0
app/services/retrieval/ranker.py +45 -0
app/services/retrieval/semantic_scholar_adapter.py +95 -0
app/workers/__init__.py +1 -0
app/workers/pipeline.py +227 -0
requirements.txt +40 -0

app/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

app/api/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

app/api/research.py ADDED Viewed

	@@ -0,0 +1,100 @@

+from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks
+from ..schemas.research import (
+    ResearchQueryRequest, ResearchQueryResponse, SessionStatusResponse,
+)
+from ..core.auth import get_current_user
+from ..core.firebase import get_db
+from ..workers.pipeline import run_research_pipeline
+import uuid
+from datetime import datetime, timezone
+router = APIRouter(prefix="/research", tags=["research"])
@router.post("/query", response_model=ResearchQueryResponse)
async def submit_query(
    body: ResearchQueryRequest,
    background_tasks: BackgroundTasks,
    user: dict = Depends(get_current_user),
):
    """Register a new research session and schedule its pipeline run.

    A ``research_sessions`` document keyed by a fresh UUID is written with
    the caller's uid, the query, status ``"accepted"`` and a UTC ISO
    timestamp. ``run_research_pipeline`` is then queued as a background
    task and the new session id is returned.
    """
    firestore_db = get_db()
    owner_uid = user["uid"]
    session_id = str(uuid.uuid4())
    session_doc = {
        "userId": owner_uid,
        "query": body.query,
        "status": "accepted",
        "createdAt": datetime.now(timezone.utc).isoformat(),
        "paperCount": 0,
    }
    firestore_db.collection("research_sessions").document(session_id).set(session_doc)
    background_tasks.add_task(run_research_pipeline, session_id, body.query, owner_uid)
    return ResearchQueryResponse(session_id=session_id)
@router.get("/{session_id}/status", response_model=SessionStatusResponse)
async def get_status(
    session_id: str,
    user: dict = Depends(get_current_user),
):
    """Return the status and paper count of a session owned by the caller.

    Raises:
        HTTPException: 404 when the session document does not exist,
            403 when its ``userId`` differs from the caller's uid.
    """
    snapshot = get_db().collection("research_sessions").document(session_id).get()
    if not snapshot.exists:
        raise HTTPException(status_code=404, detail="Session not found")

    session = snapshot.to_dict()
    if session.get("userId") != user["uid"]:
        raise HTTPException(status_code=403, detail="Forbidden")

    return SessionStatusResponse(
        session_id=session_id,
        status=session["status"],
        paper_count=session.get("paperCount"),
    )
def _require_session_owner(db, session_id: str, user: dict) -> None:
    """Ensure ``session_id`` exists and belongs to ``user``.

    Applies the same 404 / 403 rules that ``get_status`` applies, so every
    per-session read endpoint enforces ownership instead of exposing any
    session's data to any authenticated caller.

    Raises:
        HTTPException: 404 when the session document does not exist,
            403 when its ``userId`` differs from the caller's uid.
    """
    doc = db.collection("research_sessions").document(session_id).get()
    if not doc.exists:
        raise HTTPException(status_code=404, detail="Session not found")
    if (doc.to_dict() or {}).get("userId") != user["uid"]:
        raise HTTPException(status_code=403, detail="Forbidden")


@router.get("/{session_id}/papers")
async def get_papers(session_id: str, user: dict = Depends(get_current_user)):
    """Return every ``papers`` document whose ``sessionId`` matches."""
    db = get_db()
    _require_session_owner(db, session_id, user)
    docs = db.collection("papers").where("sessionId", "==", session_id).stream()
    return {"papers": [{"id": d.id, **d.to_dict()} for d in docs]}


@router.get("/{session_id}/report")
async def get_report(session_id: str, user: dict = Depends(get_current_user)):
    """Return the first ``reports`` document for the session, or 404."""
    db = get_db()
    _require_session_owner(db, session_id, user)
    docs = db.collection("reports").where("sessionId", "==", session_id).limit(1).stream()
    report = next(({"id": d.id, **d.to_dict()} for d in docs), None)
    if not report:
        raise HTTPException(status_code=404, detail="Report not ready yet")
    return {"report": report}


@router.get("/{session_id}/graph")
async def get_graph(session_id: str, user: dict = Depends(get_current_user)):
    """Return the ``graphs`` document stored under the session id, or 404."""
    db = get_db()
    _require_session_owner(db, session_id, user)
    doc = db.collection("graphs").document(session_id).get()
    if not doc.exists:
        raise HTTPException(status_code=404, detail="Graph not ready yet")
    return {"graph": doc.to_dict()}


@router.get("/{session_id}/neo4j-graph")
async def get_neo4j_graph(session_id: str, user: dict = Depends(get_current_user)):
    """Returns the knowledge graph directly from Neo4j AuraDB."""
    # Ownership is checked outside the try block so its 404/403 responses are
    # not swallowed and re-labelled as a 500 by the broad handler below.
    _require_session_owner(get_db(), session_id, user)
    try:
        from ..services.graph.neo4j_writer import get_graph_for_session
        data = get_graph_for_session(session_id)
        return {"graph": data}
    except RuntimeError as e:
        raise HTTPException(status_code=503, detail=str(e))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Neo4j query failed: {e}")


@router.get("/{session_id}/contradictions")
async def get_contradictions(session_id: str, user: dict = Depends(get_current_user)):
    """Return every ``contradictions`` document whose ``sessionId`` matches."""
    db = get_db()
    _require_session_owner(db, session_id, user)
    docs = db.collection("contradictions").where("sessionId", "==", session_id).stream()
    return {"contradictions": [{"id": d.id, **d.to_dict()} for d in docs]}

app/core/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

app/core/auth.py ADDED Viewed

	@@ -0,0 +1,45 @@

+import base64
+import json
+from fastapi import Depends, HTTPException, status
+from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
+bearer_scheme = HTTPBearer(auto_error=False)
def _decode_jwt_payload(token: str) -> dict:
    """
    Decode the payload segment of a JWT WITHOUT verifying its signature.

    The returned claims are therefore untrusted: anyone can craft a token
    that decodes successfully. Use this only for inspection, never as proof
    of identity.

    Args:
        token: Compact JWT string (``header.payload.signature``).

    Returns:
        The payload claims as a dict.

    Raises:
        HTTPException: 401 when the token has no payload segment, is not
            valid base64url/JSON, or the payload is not a JSON object.
    """
    try:
        payload_part = token.split('.')[1]
        # JWT segments are unpadded base64url; add only the padding that is
        # actually missing (0-3 characters).
        payload_part += '=' * (-len(payload_part) % 4)
        payload = json.loads(base64.urlsafe_b64decode(payload_part))
        # A JSON array/string/number would break callers that use .get().
        if not isinstance(payload, dict):
            raise ValueError("payload is not a JSON object")
        return payload
    except (IndexError, ValueError, TypeError, AttributeError) as e:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail=f"Could not decode token: {e}"
        ) from e
def get_current_user(
    credentials: HTTPAuthorizationCredentials = Depends(bearer_scheme),
) -> dict:
    """Authenticate the request from its Firebase bearer token.

    The token is verified through ``verify_token`` (Firebase Admin SDK)
    rather than merely base64-decoded. An unverified payload can be forged
    by any client, which would let a caller impersonate any uid.

    Returns:
        ``{"uid": ..., "email": ...}`` taken from the verified claims.

    Raises:
        HTTPException: 401 when the header is missing, the token is
            rejected, or it carries no user id; 503 when Firebase is not
            initialised on the server.
    """
    if not credentials or not credentials.credentials:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Not authenticated — please log in"
        )
    # Imported here so this module does not pull in firebase_admin at
    # import time.
    from .firebase import verify_token

    try:
        payload = verify_token(credentials.credentials)
    except RuntimeError as e:
        # verify_token raises RuntimeError when Firebase is not initialised.
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail=str(e)
        ) from e
    except Exception as e:
        # Any verification failure means the caller is not authenticated.
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail=f"Invalid or expired token: {e}"
        ) from e

    uid = payload.get("uid") or payload.get("user_id") or payload.get("sub")
    if not uid:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Token missing user ID"
        )
    return {"uid": uid, "email": payload.get("email", "")}

app/core/config.py ADDED Viewed

	@@ -0,0 +1,42 @@

+from pydantic_settings import BaseSettings
+from functools import lru_cache
class Settings(BaseSettings):
    """Application configuration read from the environment and ``.env``.

    Every field has a default, so the class can be instantiated without any
    variables set; empty strings mean "not configured".
    """

    # pydantic-settings configuration. Replaces the inner ``class Config``,
    # which pydantic v2 still accepts but reports as deprecated. A plain dict
    # is used so no additional import is required.
    model_config = {"env_file": ".env"}

    # Neo4j AuraDB
    neo4j_uri: str = ""
    neo4j_username: str = ""
    neo4j_password: str = ""

    # Firebase
    firebase_credentials_path: str = "serviceAccountKey.json"
    firebase_project_id: str = ""

    # LLM
    groq_api_key: str = ""
    openai_api_key: str = ""
    anthropic_api_key: str = ""
    gemini_api_key: str = ""
    # groq | openai | anthropic
    # NOTE: a gemini key can be configured above, but chat_completion() has
    # no "gemini" branch and raises ValueError for it.
    default_llm: str = "groq"

    # Academic APIs
    semantic_scholar_api_key: str = ""
    pubmed_email: str = ""

    # Celery / Redis
    redis_url: str = "redis://localhost:6379/0"
    celery_broker_url: str = "redis://localhost:6379/0"
    celery_result_backend: str = "redis://localhost:6379/1"

    # Pipeline limits
    max_papers_per_session: int = 50
    max_citation_depth: int = 1
    max_expansion_per_paper: int = 10
@lru_cache
def get_settings() -> Settings:
    """Build the Settings object once and hand back the cached instance."""
    loaded = Settings()
    return loaded

app/core/firebase.py ADDED Viewed

	@@ -0,0 +1,59 @@

+import os
+# Fix gRPC DNS resolution on Windows before any grpc import
+os.environ.setdefault("GRPC_DNS_RESOLVER", "native")
+import firebase_admin
+from firebase_admin import credentials, firestore, auth
+from .config import get_settings
+_initialized = False
+_db = None
def init_firebase():
    """Initialise the default Firebase app from the service-account file.

    Does nothing when an app is already registered. When the credentials
    file is missing, a warning is printed and the function returns without
    initialising anything.
    """
    global _initialized
    if _initialized or firebase_admin._apps:
        # Someone already initialised the SDK; just record that fact.
        _initialized = True
        return

    cred_path = get_settings().firebase_credentials_path
    if os.path.exists(cred_path):
        firebase_admin.initialize_app(credentials.Certificate(cred_path))
        _initialized = True
        print("[Firebase] Initialised successfully.")
        return

    print(
        f"[Firebase] WARNING: '{cred_path}' not found. "
        "Download from Firebase Console > Project Settings > Service Accounts."
    )
def get_db():
    """Return the cached Firestore client, creating it on first use.

    Raises:
        RuntimeError: when no Firebase app is registered even after an
            initialisation attempt.
    """
    global _db
    if _db is None:
        if not firebase_admin._apps:
            init_firebase()
        if not firebase_admin._apps:
            raise RuntimeError(
                "Firebase is not initialised. "
                "Place serviceAccountKey.json in the backend/ folder and restart."
            )
        _db = firestore.client()
        print("[Firebase] Firestore client ready.")
    return _db
def verify_token(token: str) -> dict:
    """Verify a Firebase ID token and return its decoded claims.

    Raises:
        RuntimeError: when no Firebase app is registered even after an
            initialisation attempt.
    """
    if not firebase_admin._apps:
        init_firebase()
        # Initialisation is best-effort; re-check before using the SDK.
        if not firebase_admin._apps:
            raise RuntimeError("Firebase is not initialised.")
    return auth.verify_id_token(token)

app/core/llm.py ADDED Viewed

	@@ -0,0 +1,87 @@

+"""
+Centralised LLM client.
+Supports: groq | openai | anthropic
+All services should call `chat_completion()` instead of importing SDK clients directly.
+"""
+import json
+from typing import Any
+from .config import get_settings
+# Groq model to use — fast and capable for structured extraction tasks
+GROQ_MODEL   = "llama-3.3-70b-versatile"
+OPENAI_MODEL = "gpt-4o-mini"
+CLAUDE_MODEL = "claude-3-haiku-20240307"
async def chat_completion(
    system: str,
    user: str,
    json_mode: bool = False,
    temperature: float = 0,
) -> str:
    """
    Single entry point for all LLM calls.

    Dispatches to the provider named by ``settings.default_llm`` (compared
    case-insensitively, surrounding whitespace ignored).

    Args:
        system: System prompt.
        user: User message.
        json_mode: Ask the provider for a JSON object reply. groq/openai use
            their ``response_format`` option; the anthropic helper has no
            such option, so the request is appended to the system prompt.
        temperature: Sampling temperature forwarded to the provider.

    Returns:
        The raw string content of the assistant reply.

    Raises:
        ValueError: when the configured provider is not supported.
    """
    settings = get_settings()
    provider = (settings.default_llm or "").strip().lower()
    if provider == "groq":
        return await _groq(system, user, json_mode, temperature, settings)
    if provider == "openai":
        return await _openai(system, user, json_mode, temperature, settings)
    if provider == "anthropic":
        if json_mode:
            # Previously json_mode was silently dropped for this provider,
            # so JSON consumers could receive free-form prose.
            system = (
                f"{system}\n\n"
                "Respond with a single valid JSON object and nothing else."
            )
        return await _anthropic(system, user, temperature, settings)
    raise ValueError(f"Unknown LLM provider: {settings.default_llm}")
async def _groq(system: str, user: str, json_mode: bool, temperature: float, settings) -> str:
    """Send one system+user exchange to Groq and return the reply text.

    Returns an empty string when the reply content is empty or None.
    """
    from groq import AsyncGroq

    groq_client = AsyncGroq(api_key=settings.groq_api_key)
    request: dict[str, Any] = {
        "model":       GROQ_MODEL,
        "messages":    [
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        "temperature": temperature,
    }
    if json_mode:
        request["response_format"] = {"type": "json_object"}

    completion = await groq_client.chat.completions.create(**request)
    reply = completion.choices[0].message.content
    return reply if reply else ""
async def _openai(system: str, user: str, json_mode: bool, temperature: float, settings) -> str:
    """Send one system+user exchange to OpenAI and return the reply text.

    Returns an empty string when the reply content is empty or None.
    """
    from openai import AsyncOpenAI

    openai_client = AsyncOpenAI(api_key=settings.openai_api_key)
    request: dict[str, Any] = {
        "model":       OPENAI_MODEL,
        "messages":    [
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        "temperature": temperature,
    }
    if json_mode:
        request["response_format"] = {"type": "json_object"}

    completion = await openai_client.chat.completions.create(**request)
    reply = completion.choices[0].message.content
    return reply if reply else ""
async def _anthropic(system: str, user: str, temperature: float, settings) -> str:
    """Send one system+user exchange to Anthropic and return the reply text.

    Text blocks of the reply are concatenated. An empty reply yields ``""``
    (matching the groq/openai helpers) instead of raising IndexError on
    ``msg.content[0]``.
    """
    import anthropic
    client = anthropic.AsyncAnthropic(api_key=settings.anthropic_api_key)
    msg = await client.messages.create(
        model=CLAUDE_MODEL,
        max_tokens=1024,
        system=system,
        messages=[{"role": "user", "content": user}],
        temperature=temperature,
    )
    # Content is a list of blocks; keep only those that carry text.
    texts = [getattr(block, "text", None) for block in (msg.content or [])]
    return "".join(t for t in texts if t)
async def chat_json(system: str, user: str) -> dict:
    """Convenience wrapper — returns parsed JSON dict.

    Tolerates a reply wrapped in a markdown code fence (```` ```json ... ``` ````),
    which providers without a native JSON mode may produce.

    Raises:
        ValueError: when the reply is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError`` subclass) or is JSON but not an object.
    """
    raw = await chat_completion(system, user, json_mode=True, temperature=0)
    text = (raw or "").strip()
    if text.startswith("```"):
        # Drop the opening fence line (with optional language tag) and
        # everything from the closing fence on.
        _, _, text = text.partition("\n")
        text = text.rsplit("```", 1)[0].strip()
    parsed = json.loads(text)
    if not isinstance(parsed, dict):
        raise ValueError(
            f"Expected a JSON object from the LLM, got {type(parsed).__name__}"
        )
    return parsed

app/core/neo4j_client.py ADDED Viewed

	@@ -0,0 +1,34 @@

+"""
+Neo4j AuraDB client — singleton driver with helper methods.
+"""
+from neo4j import GraphDatabase, Driver
+from .config import get_settings
+_driver: Driver | None = None
def get_driver() -> Driver:
    """Return the shared Neo4j driver, creating and verifying it on first use.

    The module-level singleton is assigned only after
    ``verify_connectivity()`` succeeds. Previously a driver that failed
    verification stayed cached (and open), so later calls returned an
    unverified driver instead of retrying.

    Raises:
        RuntimeError: when URI, username or password is not configured.
        Exception: whatever ``verify_connectivity()`` raises; the
            half-created driver is closed first.
    """
    global _driver
    if _driver is not None:
        return _driver
    settings = get_settings()
    if not (settings.neo4j_uri and settings.neo4j_username and settings.neo4j_password):
        raise RuntimeError(
            "Neo4j not configured. Set NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD in .env"
        )
    candidate = GraphDatabase.driver(
        settings.neo4j_uri,
        auth=(settings.neo4j_username, settings.neo4j_password),
    )
    try:
        candidate.verify_connectivity()
    except Exception:
        # Do not leak the connection pool or cache a broken driver.
        candidate.close()
        raise
    _driver = candidate
    print("[Neo4j] Connected to AuraDB successfully.")
    return _driver
def close_driver():
    """Close the shared Neo4j driver, if any, and clear the singleton."""
    global _driver
    if not _driver:
        return
    _driver.close()
    _driver = None

app/main.py ADDED Viewed

	@@ -0,0 +1,32 @@

+import os
+# Must be set before any grpc/firebase import to fix Windows DNS resolution
+os.environ["GRPC_DNS_RESOLVER"] = "native"
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from .core.firebase import init_firebase
+from .api import research
+app = FastAPI(
+    title="LitAgent API",
+    description="Autonomous Research Literature Agent backend",
+    version="1.0.0",
+)
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["http://localhost:3000", "https://your-production-domain.com"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+app.include_router(research.router)
@app.on_event("startup")
async def startup():
    """Application startup hook: initialise the Firebase SDK."""
    init_firebase()
@app.get("/health")
async def health():
    """Health endpoint returning a static status payload."""
    payload = {"status": "ok", "service": "litagent-api"}
    return payload

app/schemas/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

app/schemas/research.py ADDED Viewed

	@@ -0,0 +1,150 @@

+from pydantic import BaseModel, Field
+from typing import Optional, List
+from enum import Enum
class SessionStatus(str, Enum):
    """Status values a research session can carry.

    Members are strings, so they compare equal to their raw value.
    """

    accepted = "accepted"
    retrieving = "retrieving"
    ranking = "ranking"
    expanding_citations = "expanding_citations"
    parsing_pdfs = "parsing_pdfs"
    extracting = "extracting"
    building_graph = "building_graph"
    generating_report = "generating_report"
    council_review = "council_review"
    completed = "completed"
    failed = "failed"
class ResearchQueryRequest(BaseModel):
    """Request body for submitting a research query."""

    # Required; must be between 10 and 1000 characters long.
    query: str = Field(..., min_length=10, max_length=1000)
class ResearchQueryResponse(BaseModel):
    """Response returned after a research query has been submitted."""

    session_id: str
    # Defaults to "accepted" when not set explicitly.
    status: SessionStatus = SessionStatus.accepted
class SessionStatusResponse(BaseModel):
    """Response model describing the current state of a session."""

    session_id: str
    status: SessionStatus
    step: Optional[str] = None
    paper_count: Optional[int] = None
class PaperSource(str, Enum):
    """Identifiers of the external sources a paper can be attributed to."""

    arxiv = "arxiv"
    pubmed = "pubmed"
    semantic_scholar = "semantic_scholar"
    openalex = "openalex"
    crossref = "crossref"
class PaperExtraction(BaseModel):
    """Structured fields extracted from a paper; every field is optional."""

    objective: str = ""
    methodology: str = ""
    datasets: List[str] = []
    metrics: List[str] = []
    key_findings: List[str] = []
    limitations: List[str] = []
    future_work: List[str] = []
    summary: str = ""
class Paper(BaseModel):
    """A paper attached to a research session, with optional extraction."""

    # Required identification fields.
    id: str
    session_id: str
    external_source: PaperSource
    source_paper_id: str
    title: str

    # Optional bibliographic metadata.
    authors: List[str] = []
    year: Optional[int] = None
    abstract: str = ""
    doi: Optional[str] = None
    venue: Optional[str] = None
    citation_count: Optional[int] = None
    pdf_url: Optional[str] = None

    # Optional pipeline results.
    relevance_score: Optional[float] = None
    extraction: Optional[PaperExtraction] = None
class ClaimType(str, Enum):
    """Categories a claim can be labelled with."""

    performance = "performance"
    methodological = "methodological"
    theoretical = "theoretical"
    empirical = "empirical"
    limitation = "limitation"
class Claim(BaseModel):
    """A single claim attributed to a paper."""

    id: str
    paper_id: str
    paper_title: str
    claim_text: str
    claim_type: ClaimType
    evidence_span: str = ""
    confidence: float = 0.0
    entities: List[str] = []
class ContradictionSeverity(str, Enum):
    """Severity labels attached to a contradiction."""

    direct = "direct"
    partial = "partial"
    contextual = "contextual"
class Contradiction(BaseModel):
    """A pair of claims judged to conflict, with severity and explanation."""

    id: str
    session_id: str
    claim_a: Claim
    claim_b: Claim
    severity: ContradictionSeverity
    explanation: str
    reason: Optional[str] = None
class ResearchGap(BaseModel):
    """A research gap record tied to a session."""

    id: str
    session_id: str
    title: str
    description: str
    category: str
    recurrence_count: int = 0
    importance_score: float = 0.0
    source_paper_ids: List[str] = []
    derivation_note: str = ""
class GraphNode(BaseModel):
    """A node of the serialised knowledge graph."""

    id: str
    label: str
    type: str
    # Free-form extra payload; empty by default.
    data: dict = {}
class GraphEdge(BaseModel):
    """A directed edge of the serialised knowledge graph."""

    id: str
    source: str
    target: str
    type: str
    label: Optional[str] = None
class KnowledgeGraph(BaseModel):
    """Container for the graph's nodes and edges; both default to empty."""

    nodes: List[GraphNode] = []
    edges: List[GraphEdge] = []
class Report(BaseModel):
    """A generated report for a session."""

    # Required fields.
    id: str
    session_id: str
    report_markdown: str

    # Optional sections; empty when not provided.
    executive_summary: str = ""
    trend_overview: str = ""
    methodology_comparison: str = ""
    contradiction_summary: str = ""
    research_gaps: List[ResearchGap] = []
    suggested_directions: List[str] = []
    confidence_score: float = 0.0

    # Required creation timestamp, stored as a string.
    created_at: str

app/services/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

app/services/council/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

app/services/council/council_runner.py ADDED Viewed

	@@ -0,0 +1,49 @@

+"""
+LLM Council: five role-based agents validate and enrich the synthesis.
+Roles: Extractor, Skeptic, Synthesizer, Contradiction Judge, Gap Prioritizer.
+"""
+from typing import Any
+from ...core.llm import chat_completion
+ROLES = {
+    "extractor": (
+        "You are the Extractor. Confirm what the papers claim: their methods, "
+        "results, and limitations. Be factual and concise."
+    ),
+    "skeptic": (
+        "You are the Skeptic. Identify weaknesses, unsupported conclusions, "
+        "potential confounders, and overgeneralizations in the paper claims."
+    ),
+    "synthesizer": (
+        "You are the Synthesizer. Identify cross-paper patterns, shared trends, "
+        "and broader field-level insights from the provided summaries."
+    ),
+    "contradiction_judge": (
+        "You are the Contradiction Judge. Assess whether two claims genuinely "
+        "conflict or merely differ in scope/setup."
+    ),
+    "gap_prioritizer": (
+        "You are the Gap Prioritizer. Rank the identified research gaps by "
+        "importance, feasibility, and potential impact."
+    ),
+}
async def run_role(role: str, content: str) -> str:
    """Run one council role over ``content`` and return its reply.

    The role's system prompt comes from ``ROLES`` (a generic assistant
    prompt is used for unknown roles) and only the first 2000 characters of
    ``content`` are sent. Any failure is reported as a bracketed placeholder
    string rather than raised.
    """
    system_prompt = ROLES.get(role, "You are a helpful research assistant.")
    try:
        reply = await chat_completion(system_prompt, content[:2000], temperature=0.2)
    except Exception as e:
        return f"[Council role '{role}' unavailable: {e}]"
    return reply
async def council_validate_report(report_markdown: str, papers_summary: str) -> dict[str, Any]:
    """Collect council notes on the paper summaries.

    Only the extractor, skeptic and synthesizer roles are run, one after
    another, each over ``papers_summary``. ``report_markdown`` is accepted
    but not used here.

    Returns:
        A dict with ``extractor_notes``, ``skeptic_notes`` and
        ``synthesizer_notes``.
    """
    notes: dict[str, Any] = {}
    for role in ("extractor", "skeptic", "synthesizer"):
        notes[f"{role}_notes"] = await run_role(role, papers_summary)
    return notes

app/services/extraction/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

app/services/extraction/contradiction_detector.py ADDED Viewed

	@@ -0,0 +1,75 @@

+"""
+Detects contradictions between extracted claims using embeddings + LLM judgment.
+"""
+import uuid
+from typing import List
+from sentence_transformers import SentenceTransformer, util
+from ...schemas.research import Claim, Contradiction, ContradictionSeverity
+from ...core.llm import chat_json
+_model = None
+JUDGE_PROMPT = """You are a scientific contradiction detector.
+Given two claims from different papers, determine if they contradict each other.
+Return JSON with keys:
+- verdict: "contradicts" | "partially_contradicts" | "supports" | "inconclusive"
+- explanation: string (1-2 sentences)
+- reason: string (brief likely cause of conflict, if any)"""
def _get_model():
    """Return the shared sentence-embedding model, loading it on first call."""
    global _model
    if _model is not None:
        return _model
    _model = SentenceTransformer("all-MiniLM-L6-v2")
    return _model
async def detect_contradictions(
    claims: List[Claim],
    *,
    similarity_threshold: float = 0.55,
    max_pairs: int = 20,
) -> List[Contradiction]:
    """Find contradicting claim pairs across different papers.

    Claims are embedded; cross-paper pairs whose cosine similarity exceeds
    ``similarity_threshold`` become candidates, and up to ``max_pairs`` of
    them are sent to the LLM judge. Candidates are judged in descending
    similarity order, so the call budget goes to the closest pairs rather
    than to whichever pairs happen to come first in ``claims``.

    Args:
        claims: Claims to compare.
        similarity_threshold: Minimum cosine similarity (exclusive) for a
            pair to be judged.
        max_pairs: Maximum number of pairs sent to the judge.

    Returns:
        Contradictions for pairs judged ``contradicts`` (severity direct)
        or ``partially_contradicts`` (severity partial). ``session_id`` is
        left empty for the caller to fill in.
    """
    from itertools import combinations

    if len(claims) < 2:
        return []
    model = _get_model()
    embs = model.encode([c.claim_text for c in claims], convert_to_tensor=True)
    sims = util.cos_sim(embs, embs)

    scored: list[tuple[float, Claim, Claim]] = []
    for (i, ca), (j, cb) in combinations(enumerate(claims), 2):
        if ca.paper_id == cb.paper_id:
            continue
        score = float(sims[i][j])
        if score > similarity_threshold:
            scored.append((score, ca, cb))
    # Stable sort: equally similar pairs keep their original order.
    scored.sort(key=lambda item: item[0], reverse=True)

    severity_by_verdict = {
        "contradicts": ContradictionSeverity.direct,
        "partially_contradicts": ContradictionSeverity.partial,
    }
    contradictions: List[Contradiction] = []
    for _, ca, cb in scored[:max_pairs]:
        verdict_data = await _judge_pair(ca, cb)
        if not isinstance(verdict_data, dict):
            continue
        severity = severity_by_verdict.get(verdict_data.get("verdict"))
        if severity is None:
            continue
        # The LLM may return null / non-string values; coerce them so model
        # validation does not abort the whole detection run.
        explanation = verdict_data.get("explanation")
        reason = verdict_data.get("reason")
        contradictions.append(Contradiction(
            id=str(uuid.uuid4()),
            session_id="",
            claim_a=ca,
            claim_b=cb,
            severity=severity,
            explanation="" if explanation is None else str(explanation),
            reason=None if reason is None else str(reason),
        ))
    return contradictions
async def _judge_pair(ca: Claim, cb: Claim) -> dict:
    """Ask the LLM judge about one claim pair.

    Returns the judge's parsed JSON reply, or an ``inconclusive`` verdict
    with empty explanation and reason when the call fails for any reason.
    """
    prompt = (
        f"Claim A (from: {ca.paper_title}):\n{ca.claim_text}\n\n"
        f"Claim B (from: {cb.paper_title}):\n{cb.claim_text}"
    )
    try:
        verdict = await chat_json(JUDGE_PROMPT, prompt)
    except Exception:
        return {"verdict": "inconclusive", "explanation": "", "reason": ""}
    return verdict

app/services/extraction/extractor.py ADDED Viewed

	@@ -0,0 +1,37 @@

+"""
+Structured information extraction from paper text using LLM + Pydantic validation.
+"""
+from ...schemas.research import PaperExtraction
+from ...core.llm import chat_json
+SYSTEM_PROMPT = """You are a scientific paper analyst. Given the text of an academic paper,
+extract structured information and return it as valid JSON with these exact keys:
+- objective: string (the main research goal)
+- methodology: string (the approach/method used)
+- datasets: list of strings (dataset names used)
+- metrics: list of strings (evaluation metrics)
+- key_findings: list of strings (3-5 main results/conclusions)
+- limitations: list of strings (stated or implied limitations)
+- future_work: list of strings (suggested future directions)
+- summary: string (2-3 sentence plain English summary)
+Be concise and factual. Do not hallucinate."""
async def extract_paper(title: str, abstract: str, sections: dict[str, str]) -> PaperExtraction:
    """Extract structured fields from a paper via the LLM.

    The prompt is built from the title, the abstract and the first matching
    section for each of a fixed list of section keywords (each section at
    most once, 1500 characters per section, 4000 characters overall).

    LLM output is merged over the model defaults: missing or null fields
    fall back to the default, and a bare string returned for a list field
    is wrapped in a list, so one sloppy field no longer discards the whole
    extraction.

    Returns:
        The parsed extraction, or a fallback built from the abstract when
        the LLM call or validation fails.
    """
    abstract = abstract or ""
    text_parts = [f"Title: {title}", f"Abstract: {abstract}"]
    used_sections: set[str] = set()
    for key in ["introduction", "methodology", "methods", "results", "conclusion", "limitations"]:
        for section_name, content in (sections or {}).items():
            if section_name in used_sections:
                continue
            if key in section_name.lower() and content:
                text_parts.append(f"\n## {section_name.title()}\n{content[:1500]}")
                used_sections.add(section_name)
                break
    text = "\n".join(text_parts)[:4000]
    try:
        data = await chat_json(SYSTEM_PROMPT, text)
        fields = {}
        for name, default in PaperExtraction().model_dump().items():
            value = data.get(name)
            if value is None:
                value = default
            elif isinstance(default, list) and isinstance(value, str):
                value = [value] if value else []
            fields[name] = value
        return PaperExtraction(**fields)
    except Exception:
        # Best-effort fallback: keep at least something derived from the abstract.
        return PaperExtraction(
            objective=abstract[:200],
            summary=abstract[:300],
        )

app/services/graph/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

app/services/graph/graph_builder.py ADDED Viewed

	@@ -0,0 +1,64 @@

+"""
+Builds a knowledge graph from papers using networkx, serializes to JSON for frontend.
+"""
+import uuid
+import networkx as nx
+from typing import List
+from ...schemas.research import Paper, KnowledgeGraph, GraphNode, GraphEdge
def build_graph(papers: List[Paper]) -> KnowledgeGraph:
    """Build the node/edge lists of the knowledge graph for ``papers``.

    For every paper a ``Paper`` node is emitted, plus a ``WRITTEN_BY`` edge
    to its first author. Papers that carry an extraction additionally get a
    ``USES`` edge to a method node (keyed by the first 40 characters of the
    methodology, case-insensitively) and ``TESTED_ON`` edges to at most
    three dataset nodes (keyed case-insensitively, blank names skipped).

    Changes from the previous version: author nodes are no longer dropped
    for papers without an extraction, blank dataset names are skipped (as
    the Neo4j writer does), and the throwaway networkx graph and the unused
    ``seen_topics`` map are gone.
    """
    nodes: List[GraphNode] = []
    edges: List[GraphEdge] = []
    node_ids: set[str] = set()
    method_ids: dict[str, str] = {}
    dataset_ids: dict[str, str] = {}

    def add_node(node_id: str, label: str, node_type: str) -> None:
        node_ids.add(node_id)
        nodes.append(GraphNode(id=node_id, label=label, type=node_type))

    def add_edge(source: str, target: str, edge_type: str) -> None:
        edges.append(GraphEdge(
            id=str(uuid.uuid4()), source=source, target=target, type=edge_type,
        ))

    for paper in papers:
        add_node(paper.id, paper.title[:50], "Paper")

        # Author node (first author) — independent of extraction.
        if paper.authors:
            author = paper.authors[0]
            aid = f"author_{author.replace(' ', '_').lower()}"
            if aid not in node_ids:
                add_node(aid, author, "Author")
            add_edge(paper.id, aid, "WRITTEN_BY")

        extraction = paper.extraction
        if not extraction:
            continue

        # Method node, shared between papers with the same truncated name.
        if extraction.methodology:
            method_label = extraction.methodology[:40]
            method_key = method_label.lower()
            if method_key not in method_ids:
                method_ids[method_key] = str(uuid.uuid4())
                add_node(method_ids[method_key], method_label, "Method")
            add_edge(paper.id, method_ids[method_key], "USES")

        # Dataset nodes, shared between papers naming the same dataset.
        for ds in (extraction.datasets or [])[:3]:
            if not ds or not ds.strip():
                continue
            dataset_key = ds.strip().lower()
            if dataset_key not in dataset_ids:
                dataset_ids[dataset_key] = str(uuid.uuid4())
                add_node(dataset_ids[dataset_key], ds, "Dataset")
            add_edge(paper.id, dataset_ids[dataset_key], "TESTED_ON")

    return KnowledgeGraph(nodes=nodes, edges=edges)

app/services/graph/neo4j_writer.py ADDED Viewed

	@@ -0,0 +1,210 @@

+"""
+Writes the knowledge graph to Neo4j AuraDB.
+Node labels:  Paper | Author | Method | Dataset | Claim | Topic
+Relationships: CITES | USES | TESTED_ON | SUPPORTS | CONTRADICTS | WRITTEN_BY | RELATED_TO
+"""
+from typing import List
+from ...schemas.research import Paper, Contradiction
+from ...core.neo4j_client import get_driver
+# ── Schema constraints (run once) ────────────────────────────────────────────
+CONSTRAINTS = [
+    "CREATE CONSTRAINT paper_id   IF NOT EXISTS FOR (p:Paper)   REQUIRE p.id   IS UNIQUE",
+    "CREATE CONSTRAINT author_name IF NOT EXISTS FOR (a:Author)  REQUIRE a.name IS UNIQUE",
+    "CREATE CONSTRAINT method_name IF NOT EXISTS FOR (m:Method)  REQUIRE m.name IS UNIQUE",
+    "CREATE CONSTRAINT dataset_name IF NOT EXISTS FOR (d:Dataset) REQUIRE d.name IS UNIQUE",
+]
def ensure_constraints():
    """Apply the uniqueness constraints in ``CONSTRAINTS`` (best-effort).

    Each statement already uses ``IF NOT EXISTS``, so an exception here is
    not the "already exists" case. Failures are reported instead of being
    silently discarded, but they do not stop the remaining statements.
    """
    driver = get_driver()
    with driver.session() as session:
        for constraint in CONSTRAINTS:
            try:
                session.run(constraint)
            except Exception as e:
                print(f"[Neo4j] WARNING: could not apply constraint ({constraint!r}): {e}")
+# ── Write papers ─────────────────────────────────────────────────────────────
def write_papers(papers: List[Paper], session_id: str):
    """Upsert papers and their related nodes into Neo4j.

    For each paper: MERGE the Paper node and set its properties (abstract
    cut to 500 characters), then MERGE up to three Author nodes with
    WRITTEN_BY edges. When the paper has an extraction, also MERGE a Method
    node (name cut to 80 characters) with a USES edge and up to three
    Dataset nodes with TESTED_ON edges. Empty names are skipped.
    """
    with get_driver().session() as db:
        for item in papers:
            pid = item.id

            # Paper node
            db.run(
                """
                MERGE (p:Paper {id: $id})
                SET p.title        = $title,
                    p.year         = $year,
                    p.abstract     = $abstract,
                    p.venue        = $venue,
                    p.citationCount= $citation_count,
                    p.sessionId    = $session_id,
                    p.source       = $source
                """,
                id=pid,
                title=item.title,
                year=item.year,
                abstract=(item.abstract or "")[:500],
                venue=item.venue or "",
                citation_count=item.citation_count or 0,
                session_id=session_id,
                source=item.external_source,
            )

            # Authors (first three, falsy names dropped)
            for author in filter(None, (item.authors or [])[:3]):
                db.run(
                    """
                    MERGE (a:Author {name: $name})
                    WITH a
                    MATCH (p:Paper {id: $paper_id})
                    MERGE (p)-[:WRITTEN_BY]->(a)
                    """,
                    name=author, paper_id=pid,
                )

            extraction = item.extraction
            if not extraction:
                continue

            # Method
            if extraction.methodology:
                db.run(
                    """
                    MERGE (m:Method {name: $name})
                    WITH m
                    MATCH (p:Paper {id: $paper_id})
                    MERGE (p)-[:USES]->(m)
                    """,
                    name=extraction.methodology[:80], paper_id=pid,
                )

            # Datasets (first three, falsy names dropped)
            for dataset in filter(None, (extraction.datasets or [])[:3]):
                db.run(
                    """
                    MERGE (d:Dataset {name: $name})
                    WITH d
                    MATCH (p:Paper {id: $paper_id})
                    MERGE (p)-[:TESTED_ON]->(d)
                    """,
                    name=dataset, paper_id=pid,
                )
    print(f"[Neo4j] Wrote {len(papers)} papers to AuraDB.")
+# ── Write contradictions ─────────────────────────────────────────────────────
def write_contradictions(contradictions: List[Contradiction]):
    """Persist contradictions as CONTRADICTS relationships between Paper nodes.

    For every contradiction, the two Paper nodes are looked up by the paper ids
    of ``claim_a`` and ``claim_b`` and a ``CONTRADICTS`` relationship keyed by
    the contradiction id is merged between them; severity and a truncated
    explanation are stored on the relationship.

    Args:
        contradictions: Contradictions to write. An empty list is a no-op and
            no driver/session is opened.
    """
    if not contradictions:
        return

    query = """
        MATCH (a:Paper {id: $paper_a})
        MATCH (b:Paper {id: $paper_b})
        MERGE (a)-[r:CONTRADICTS {id: $cid}]->(b)
        SET r.severity    = $severity,
            r.explanation = $explanation
        """

    driver = get_driver()
    with driver.session() as session:
        for c in contradictions:
            session.run(
                query,
                paper_a=c.claim_a.paper_id,
                paper_b=c.claim_b.paper_id,
                cid=c.id,
                severity=c.severity,
                # Guard against a missing explanation so one incomplete
                # contradiction cannot abort the whole batch with a TypeError.
                explanation=(c.explanation or "")[:300],
            )
    # NOTE: this counts submitted contradictions; a MATCH that finds no Paper
    # node writes nothing for that item.
    print(f"[Neo4j] Wrote {len(contradictions)} contradiction edges.")
+# ── Query helpers for the API ─────────────────────────────────────────────────
def get_graph_for_session(session_id: str) -> dict:
    """Read a session's knowledge graph and shape it for the frontend.

    Returns a dict ``{"nodes": [...], "edges": [...]}``. Nodes carry
    ``id``/``label``/``type``; edges carry ``id``/``source``/``target``/``type``
    (CONTRADICTS edges additionally carry ``label`` with the severity).
    """
    nodes = []
    edges = []
    seen_node_ids: set[str] = set()

    def _attach(paper_id, node_id, label, node_type, edge_type):
        # Register the neighbour node once, but always emit the paper -> node edge.
        if node_id not in seen_node_ids:
            seen_node_ids.add(node_id)
            nodes.append({"id": node_id, "label": label, "type": node_type})
        edges.append({"id": f"e_{paper_id}_{node_id}", "source": paper_id, "target": node_id, "type": edge_type})

    driver = get_driver()
    with driver.session() as session:
        # One row per paper, with its authors/methods/datasets aggregated.
        paper_rows = session.run(
            """
            MATCH (p:Paper {sessionId: $sid})
            OPTIONAL MATCH (p)-[:WRITTEN_BY]->(a:Author)
            OPTIONAL MATCH (p)-[:USES]->(m:Method)
            OPTIONAL MATCH (p)-[:TESTED_ON]->(d:Dataset)
            RETURN p, collect(DISTINCT a) AS authors,
                      collect(DISTINCT m) AS methods,
                      collect(DISTINCT d) AS datasets
            """,
            sid=session_id,
        )
        for row in paper_rows:
            paper_node = row["p"]
            paper_id = paper_node["id"]
            if paper_id not in seen_node_ids:
                seen_node_ids.add(paper_id)
                nodes.append({"id": paper_id, "label": (paper_node.get("title") or "")[:50], "type": "Paper"})

            for author in row["authors"]:
                if author is not None:
                    _attach(paper_id, f"author_{author['name']}", author["name"], "Author", "WRITTEN_BY")

            # NOTE(review): method/dataset ids use only the first 30 characters
            # of the name, so two long names sharing that prefix map to one node.
            for method in row["methods"]:
                if method is not None:
                    _attach(paper_id, f"method_{method['name'][:30]}", method["name"][:40], "Method", "USES")

            for dataset in row["datasets"]:
                if dataset is not None:
                    _attach(paper_id, f"dataset_{dataset['name'][:30]}", dataset["name"], "Dataset", "TESTED_ON")

        # Contradiction edges between two papers of the same session
        contradiction_rows = session.run(
            """
            MATCH (a:Paper {sessionId: $sid})-[r:CONTRADICTS]->(b:Paper {sessionId: $sid})
            RETURN a.id AS src, b.id AS tgt, r.id AS rid, r.severity AS severity
            """,
            sid=session_id,
        )
        for row in contradiction_rows:
            edges.append({
                "id":     row["rid"],
                "source": row["src"],
                "target": row["tgt"],
                "type":   "CONTRADICTS",
                "label":  row["severity"],
            })
    return {"nodes": nodes, "edges": edges}

app/services/parsing/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

app/services/parsing/pdf_parser.py ADDED Viewed

	@@ -0,0 +1,85 @@

+"""
+Downloads and parses PDFs.
+Primary: PyMuPDF (fitz)  |  Fallback: pdfplumber
+"""
+import fitz  # PyMuPDF
+import pdfplumber
+import httpx
+import tempfile
+import os
+from typing import Optional
# Lower-case prefixes; a line that starts with one of these is treated as a
# section heading by the PyMuPDF-based splitter.
SECTION_KEYWORDS = [
    "abstract",
    "introduction",
    "related work",
    "background",
    "methodology",
    "methods",
    "experiments",
    "results",
    "discussion",
    "conclusion",
    "references",
    "limitations",
]
def download_pdf(url: str) -> Optional[bytes]:
    """Download *url* and return the body if the response looks like a PDF.

    Args:
        url: Location of the PDF; redirects are followed.

    Returns:
        The raw response bytes when the status is 200 and the Content-Type
        header mentions "pdf" (case-insensitive); otherwise None. Any error
        raised while fetching is logged and also yields None.
    """
    try:
        with httpx.Client(timeout=30, follow_redirects=True) as client:
            resp = client.get(url, headers={"User-Agent": "LitAgent/1.0"})
            # Header values are not case-normalised, so compare in lower case.
            content_type = resp.headers.get("content-type", "").lower()
            if resp.status_code == 200 and "pdf" in content_type:
                return resp.content
    except Exception as e:
        # Best-effort download: a missing PDF must not fail the caller,
        # but the reason should be visible in the logs.
        print(f"[PDF] download error for {url}: {e}")
    return None
def extract_text_pymupdf(pdf_bytes: bytes) -> dict[str, str]:
    """Extract text with PyMuPDF and split it into sections.

    A line whose lower-cased, stripped text starts with one of
    ``SECTION_KEYWORDS`` is treated as a heading: it closes the current
    section and names the next one (first 40 characters of the lower-cased
    line). The heading line itself is not included in any section text. Text
    before the first heading is stored under ``"body"``.

    Args:
        pdf_bytes: Raw PDF content.

    Returns:
        Mapping of section name to its text. When the same section name
        occurs more than once, the pieces are concatenated in document order
        instead of the later one replacing the earlier one.
    """
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        full_text = "".join(page.get_text("text") + "\n" for page in doc)

    sections: dict[str, str] = {}
    heading_prefixes = tuple(SECTION_KEYWORDS)

    def _store(name: str, lines: list[str]) -> None:
        # Append to an existing section so a repeated heading loses no text.
        text = "\n".join(lines).strip()
        previous = sections.get(name)
        sections[name] = f"{previous}\n{text}" if previous else text

    current_section = "body"
    buffer: list[str] = []
    for line in full_text.split("\n"):
        lowered = line.strip().lower()
        if lowered.startswith(heading_prefixes):
            if buffer:
                _store(current_section, buffer)
                buffer = []
            current_section = lowered[:40]
        else:
            buffer.append(line)
    if buffer:
        _store(current_section, buffer)
    return sections
def extract_text_pdfplumber(pdf_bytes: bytes) -> dict[str, str]:
    """Fallback parser: return all page text under a single ``"body"`` key.

    The bytes are written to a temporary ``.pdf`` file that pdfplumber opens
    by path; the file is removed afterwards whether or not parsing succeeds.
    """
    tmp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    with tmp_file:
        tmp_file.write(pdf_bytes)
    tmp_path = tmp_file.name

    try:
        with pdfplumber.open(tmp_path) as pdf:
            # Pages without extractable text contribute just the newline.
            page_texts = [(page.extract_text() or "") + "\n" for page in pdf.pages]
    finally:
        os.unlink(tmp_path)
    return {"body": "".join(page_texts)}
def parse_pdf(url: str) -> Optional[dict[str, str]]:
    """Main entry point: download a PDF and return its text split by section.

    PyMuPDF is tried first. If it fails, or finds fewer than two sections,
    pdfplumber is tried exactly once. If pdfplumber also fails, whatever
    PyMuPDF managed to extract is returned rather than being thrown away.

    Args:
        url: Location of the PDF.

    Returns:
        Mapping of section name to text, or None when the PDF could not be
        downloaded or neither parser produced anything.
    """
    pdf_bytes = download_pdf(url)
    if not pdf_bytes:
        return None

    sections: Optional[dict[str, str]] = None
    try:
        sections = extract_text_pymupdf(pdf_bytes)
    except Exception as e:
        print(f"[PDF] PyMuPDF failed, falling back to pdfplumber: {e}")

    if sections and len(sections) >= 2:
        return sections

    try:
        return extract_text_pdfplumber(pdf_bytes)
    except Exception as e:
        print(f"[PDF] pdfplumber failed: {e}")
        # A sparse PyMuPDF result is still better than nothing.
        return sections or None

app/services/reporting/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

app/services/reporting/report_generator.py ADDED Viewed

	@@ -0,0 +1,87 @@

+"""
+Generates the final literature review report using LLM synthesis.
+"""
+from typing import List
+from ...schemas.research import Paper, Report, ResearchGap
+from ...core.llm import chat_completion, chat_json
+from datetime import datetime, timezone
+import uuid
SYNTHESIS_PROMPT = """You are an expert academic research synthesizer. Given a list of paper summaries,
generate a comprehensive literature review report in Markdown format with these sections:
1. Executive Summary (3-5 sentences)
2. Trends in Literature
3. Methodology Comparison
4. Major Findings
5. Identified Contradictions
6. Research Gaps
7. Suggested Future Directions
8. References
Be analytical, cite papers by [Author, Year], and provide a confidence score (0-1) at the end as:
CONFIDENCE: 0.XX"""
GAP_PROMPT = """Given these paper limitations and contradictions, identify 3-5 distinct research gaps.
Return a JSON object with key "gaps" containing an array of objects:
{title, description, category, recurrence_count, importance_score, derivation_note}
Categories: missing_population, missing_replication, inconsistent_benchmarks, data_scarcity, interpretability, fairness, other"""


def _summarize_paper(p: Paper) -> str:
    """Render one paper as the short text block fed to the synthesis prompt."""
    authors = [a for a in (p.authors or []) if a][:2]
    summary = f"[{', '.join(authors)}, {p.year}] {p.title}"
    if p.extraction:
        summary += f"\n  Objective: {p.extraction.objective}"
        summary += f"\n  Method: {(p.extraction.methodology or '')[:100]}"
        if p.extraction.key_findings:
            summary += f"\n  Findings: {'; '.join(p.extraction.key_findings[:2])}"
        if p.extraction.limitations:
            summary += f"\n  Limitations: {'; '.join(p.extraction.limitations[:2])}"
    return summary


def _parse_confidence(markdown: str, default: float = 0.7) -> float:
    """Read the value after the first ``CONFIDENCE:`` marker, clamped to [0, 1].

    Returns *default* when there is no marker or the value is not a number.
    """
    for line in markdown.splitlines():
        if "CONFIDENCE:" not in line:
            continue
        # Models often wrap the line in markdown emphasis; strip it off.
        raw = line.split("CONFIDENCE:", 1)[1].strip().strip("*_` ")
        try:
            return min(1.0, max(0.0, float(raw)))
        except ValueError:
            return default
    return default


async def _extract_gaps(papers: List[Paper], session_id: str) -> List[ResearchGap]:
    """Ask the LLM for research gaps based on the papers' stated limitations.

    Never raises: LLM errors and malformed gap entries are logged and skipped
    so they cannot invalidate a report that was already generated.
    """
    lim_context = "\n".join(
        f"- [{(p.title or '')[:40]}]: {'; '.join((p.extraction.limitations or [])[:2])}"
        for p in papers if p.extraction and p.extraction.limitations
    )[:2000]
    if not lim_context:
        return []

    try:
        gap_data = await chat_json(GAP_PROMPT, lim_context)
    except Exception as e:
        print(f"[Report] gap extraction error: {e}")
        return []

    raw_gaps = gap_data.get("gaps", gap_data) if isinstance(gap_data, dict) else gap_data
    if not isinstance(raw_gaps, list):
        return []

    gaps: List[ResearchGap] = []
    for g in raw_gaps:
        if not isinstance(g, dict):
            continue
        # id and session_id are assigned here; drop any the model invented so
        # they do not collide with the explicit keyword arguments.
        fields = {k: v for k, v in g.items() if k not in ("id", "session_id")}
        try:
            gaps.append(ResearchGap(id=str(uuid.uuid4()), session_id=session_id, **fields))
        except Exception as e:
            print(f"[Report] skipping malformed research gap: {e}")
    return gaps


async def generate_report(query: str, papers: List[Paper], session_id: str) -> Report:
    """Synthesize the final literature review for a session.

    At most the first 20 papers are summarized and sent to the LLM together
    with the query. Research gaps are extracted in a separate, independently
    guarded step, so a failure there leaves the report text intact.

    Args:
        query: The user's original research question.
        papers: Papers of the session, ideally with ``extraction`` populated.
        session_id: Session the report and its gaps belong to.

    Returns:
        A Report. If synthesis itself fails, ``report_markdown`` is a short
        stub naming the error and listing up to 10 paper titles, the
        confidence stays at the default 0.7 and no gaps are attached.
    """
    paper_texts = [_summarize_paper(p) for p in papers[:20]]
    context = f"Research Query: {query}\n\nPapers:\n" + "\n\n".join(paper_texts)

    confidence_score = 0.7
    gaps: List[ResearchGap] = []
    try:
        report_markdown = (await chat_completion(SYNTHESIS_PROMPT, context, temperature=0.3)) or ""
    except Exception as e:
        report_markdown = (
            f"# Literature Review: {query}\n\n"
            f"*Report generation encountered an error: {e}*\n\n"
            + "\n\n".join(f"- **{p.title}** ({p.year})" for p in papers[:10])
        )
    else:
        confidence_score = _parse_confidence(report_markdown, confidence_score)
        gaps = await _extract_gaps(papers, session_id)

    return Report(
        id=str(uuid.uuid4()),
        session_id=session_id,
        report_markdown=report_markdown,
        research_gaps=gaps,
        confidence_score=confidence_score,
        created_at=datetime.now(timezone.utc).isoformat(),
    )

app/services/retrieval/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

app/services/retrieval/arxiv_adapter.py ADDED Viewed

	@@ -0,0 +1,30 @@

+"""arXiv paper retrieval adapter."""
+import arxiv
+from typing import List
+from ...schemas.research import Paper, PaperSource
+import uuid
def search(query: str, session_id: str, max_results: int = 15) -> List[Paper]:
    """Query arXiv by relevance and map each hit to a Paper for *session_id*.

    Every returned Paper gets a fresh UUID as ``id``; the arXiv entry id is
    kept in ``source_paper_id`` and the venue is always ``"arXiv"``.
    """
    request = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance,
    )

    def _to_paper(entry) -> Paper:
        # The publication year is optional on the arXiv result.
        published_year = entry.published.year if entry.published else None
        return Paper(
            id=str(uuid.uuid4()),
            session_id=session_id,
            external_source=PaperSource.arxiv,
            source_paper_id=entry.entry_id,
            title=entry.title,
            authors=[author.name for author in entry.authors],
            year=published_year,
            abstract=entry.summary,
            doi=entry.doi,
            venue="arXiv",
            pdf_url=entry.pdf_url,
        )

    return [_to_paper(entry) for entry in arxiv.Client().results(request)]

app/services/retrieval/query_decomposer.py ADDED Viewed

	@@ -0,0 +1,34 @@

+"""
+Decomposes a raw research query into a structured search plan using LLM.
+"""
+from pydantic import BaseModel
+from typing import List
+from ...core.llm import chat_json
class QueryPlan(BaseModel):
    """Structured search plan derived from a user's research question."""

    # Short statement of what the question is about.
    main_topic: str
    # Narrower topics contained in the question.
    subtopics: List[str]
    # Search constraints; the prompt asks for {"year_from": int or null}.
    filters: dict
    # Alternative search strings to send to the paper databases.
    query_variants: List[str]
SYSTEM_PROMPT = """You are a research librarian. Given a user's research question,
decompose it into a structured search plan. Return valid JSON only with these keys:
- main_topic: string
- subtopics: list of strings
- filters: {"year_from": int or null}
- query_variants: list of 3-5 search strings for different academic databases"""


async def decompose_query(query: str) -> QueryPlan:
    """Turn a raw research question into a QueryPlan via the LLM.

    Args:
        query: The user's research question.

    Returns:
        The plan produced by the LLM. If the LLM call fails or its answer
        does not validate as a QueryPlan, a minimal plan is returned whose
        only query variant is the raw query; the cause is logged.
    """
    try:
        data = await chat_json(SYSTEM_PROMPT, query)
        return QueryPlan(**data)
    except Exception as e:
        # Best-effort: the pipeline can still search with the raw query,
        # but the reason for the fallback should not be invisible.
        print(f"[QueryDecomposer] falling back to raw query: {e}")
        return QueryPlan(
            main_topic=query[:80],
            subtopics=[],
            filters={},
            query_variants=[query],
        )

app/services/retrieval/ranker.py ADDED Viewed

	@@ -0,0 +1,45 @@

+"""
+Scores and ranks papers by relevance to the original query.
+Uses: semantic similarity (sentence-transformers) + citation count + recency.
+"""
+from typing import List
+from ...schemas.research import Paper
+from sentence_transformers import SentenceTransformer, util
+import torch
# Module-level cache so the embedding model is constructed at most once.
_model: SentenceTransformer | None = None


def _get_model() -> SentenceTransformer:
    """Return the shared ``all-MiniLM-L6-v2`` model, creating it on first use."""
    global _model
    if _model is not None:
        return _model
    _model = SentenceTransformer("all-MiniLM-L6-v2")
    return _model
def rank_papers(
    query: str,
    papers: List[Paper],
    top_k: int = 20,
    current_year: int | None = None,
) -> List[Paper]:
    """Score papers against *query* and return the best *top_k*.

    The score is ``0.6 * semantic + 0.25 * citations + 0.15 * recency`` where
    semantic is the cosine similarity between the query and
    "title. abstract[:300]", citations is the citation count divided by the
    largest count in the batch, and recency falls linearly from 1 to 0 over
    ten years.

    Args:
        query: The user's research question.
        papers: Candidate papers. Each paper's ``relevance_score`` is
            overwritten in place.
        top_k: Maximum number of papers to return.
        current_year: Reference year for recency; defaults to the current
            UTC year so the ranking does not go stale.

    Returns:
        Up to *top_k* papers, highest score first. Empty input gives ``[]``.
    """
    if not papers:
        return []
    if current_year is None:
        from datetime import datetime, timezone
        current_year = datetime.now(timezone.utc).year

    model = _get_model()
    # Abstract may be missing; treat it as empty instead of failing the batch.
    texts = [f"{p.title}. {(p.abstract or '')[:300]}" for p in papers]
    q_emb = model.encode(query, convert_to_tensor=True)
    p_embs = model.encode(texts, convert_to_tensor=True)
    sem_scores = util.cos_sim(q_emb, p_embs)[0].tolist()

    # Normalise citation counts; fall back to 1 to avoid dividing by zero.
    max_citations = max((p.citation_count or 0) for p in papers) or 1

    scored = []
    for paper, sem in zip(papers, sem_scores):
        cite = (paper.citation_count or 0) / max_citations
        # A paper without a year is treated as published this year.
        age = current_year - (paper.year or current_year)
        # Clamp both ends: papers dated after current_year must not exceed 1.
        rec = min(1.0, max(0.0, 1 - age / 10))
        score = 0.6 * sem + 0.25 * cite + 0.15 * rec
        paper.relevance_score = round(float(score), 4)
        scored.append(paper)

    scored.sort(key=lambda p: p.relevance_score or 0, reverse=True)
    return scored[:top_k]

app/services/retrieval/semantic_scholar_adapter.py ADDED Viewed

	@@ -0,0 +1,95 @@

+"""Semantic Scholar paper retrieval adapter."""
+import httpx
+from typing import List
+from ...schemas.research import Paper, PaperSource
+from ...core.config import get_settings
+import uuid
+import time
+BASE   = "https://api.semanticscholar.org/graph/v1"
+FIELDS = "paperId,title,authors,year,abstract,citationCount,venue,externalIds,openAccessPdf"
+def _headers() -> dict:
+    key = get_settings().semantic_scholar_api_key
+    return {"x-api-key": key} if key else {}
def search(query: str, session_id: str, limit: int = 10) -> List[Paper]:
    """Search Semantic Scholar and map the hits to Papers for *session_id*.

    Args:
        query: Search string.
        session_id: Session the resulting papers belong to.
        limit: Maximum number of results requested from the API.

    Returns:
        The parsed papers. Returns ``[]`` on rate limiting (429), any other
        non-200 status, or a request error; individual results that cannot
        be turned into a Paper are logged and skipped.
    """
    try:
        with httpx.Client(timeout=15) as client:
            resp = client.get(
                f"{BASE}/paper/search",
                params={"query": query, "limit": limit, "fields": FIELDS},
                headers=_headers(),
            )
            if resp.status_code == 429:
                print("[SemanticScholar] Rate limited — skipping, using arXiv only.")
                return []
            if resp.status_code != 200:
                print(f"[SemanticScholar] search returned {resp.status_code} — skipping.")
                return []
            data = resp.json().get("data", []) or []
    except Exception as e:
        print(f"[SemanticScholar] search error: {e}")
        return []

    papers = []
    for item in data:
        try:
            papers.append(Paper(
                id=str(uuid.uuid4()),
                session_id=session_id,
                external_source=PaperSource.semantic_scholar,
                source_paper_id=item.get("paperId") or "",
                title=item.get("title") or "",
                authors=[a.get("name") or "" for a in (item.get("authors") or [])],
                year=item.get("year"),
                abstract=item.get("abstract") or "",
                # The key can be present with a null value, so a .get default
                # is not enough; `or {}` keeps such papers from being dropped.
                doi=(item.get("externalIds") or {}).get("DOI"),
                venue=item.get("venue"),
                citation_count=item.get("citationCount"),
                pdf_url=(item.get("openAccessPdf") or {}).get("url"),
            ))
        except Exception as e:
            print(f"[SemanticScholar] skipping malformed search result: {e}")
            continue
    return papers
def get_references(paper_id: str, session_id: str, limit: int = 10) -> List[Paper]:
    """Fetch the papers referenced by *paper_id* from Semantic Scholar.

    Args:
        paper_id: Semantic Scholar paper id; an empty value returns ``[]``.
        session_id: Session the resulting papers belong to.
        limit: Maximum number of references requested.

    Returns:
        Referenced papers that have a title. Returns ``[]`` on any non-200
        status or request error. Entries that cannot be turned into a Paper
        are logged and skipped.

    Note:
        Sleeps for one second before the request to stay under the rate
        limit, so this call blocks the calling thread for at least that long.
    """
    if not paper_id:
        return []
    try:
        time.sleep(1)  # Be polite to avoid 429
        with httpx.Client(timeout=15) as client:
            resp = client.get(
                f"{BASE}/paper/{paper_id}/references",
                params={"limit": limit, "fields": FIELDS},
                headers=_headers(),
            )
            if resp.status_code != 200:
                print(f"[SemanticScholar] get_references returned {resp.status_code} — skipping.")
                return []
            data = resp.json().get("data", []) or []
    except Exception as e:
        print(f"[SemanticScholar] get_references error: {e}")
        return []

    results = []
    for item in data:
        cited = item.get("citedPaper") or {}
        if not cited.get("title"):
            continue
        try:
            results.append(Paper(
                id=str(uuid.uuid4()),
                session_id=session_id,
                external_source=PaperSource.semantic_scholar,
                source_paper_id=cited.get("paperId") or "",
                title=cited.get("title") or "",
                authors=[a.get("name") or "" for a in (cited.get("authors") or [])],
                year=cited.get("year"),
                abstract=cited.get("abstract") or "",
                # These fields are requested via FIELDS; keep them so cited
                # papers carry the same metadata as direct search hits.
                doi=(cited.get("externalIds") or {}).get("DOI"),
                venue=cited.get("venue"),
                citation_count=cited.get("citationCount"),
                pdf_url=(cited.get("openAccessPdf") or {}).get("url"),
            ))
        except Exception as e:
            print(f"[SemanticScholar] skipping malformed reference: {e}")
            continue
    return results

app/workers/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

app/workers/pipeline.py ADDED Viewed

	@@ -0,0 +1,227 @@

+"""
+Research pipeline using FastAPI BackgroundTasks (no Redis/Celery required).
+"""
+import asyncio
+import uuid
+from datetime import datetime, timezone
+def _update_status(db, session_id: str, status: str, extra: dict = {}):
+    try:
+        db.collection("research_sessions").document(session_id).update(
+            {"status": status, **extra}
+        )
+        print(f"[Pipeline] {session_id[:8]}... → {status}")
+    except Exception as e:
+        print(f"[Pipeline] Failed to update status: {e}")
async def run_research_pipeline(session_id: str, query: str, user_id: str):
    """Full async pipeline — called as a FastAPI background task.

    Runs the stages in order and records progress on the session document:
    retrieving → ranking → expanding_citations → extracting → parsing_pdfs →
    extracting → building_graph → generating_report → council_review →
    completed. Most stages catch and log their own errors so one failing
    stage degrades the result instead of aborting the run.

    Blocking library calls (arXiv / Semantic Scholar requests, ranking, PDF
    parsing, Neo4j writes) are executed with ``asyncio.to_thread`` so the
    event loop stays free to serve other requests while the pipeline runs.

    Args:
        session_id: Id of the research session being processed.
        query: The user's research question.
        user_id: Id of the requesting user (not used inside this function).

    Raises:
        Exception: Any error not handled by a stage is re-raised after the
            session has been marked ``failed``.
    """
    from ..core.firebase import get_db, init_firebase
    from ..services.retrieval.query_decomposer import decompose_query
    from ..services.retrieval import arxiv_adapter, semantic_scholar_adapter
    from ..services.retrieval.ranker import rank_papers
    from ..services.parsing.pdf_parser import parse_pdf
    from ..services.extraction.extractor import extract_paper
    from ..services.extraction.contradiction_detector import detect_contradictions
    from ..services.graph.graph_builder import build_graph
    from ..services.reporting.report_generator import generate_report
    from ..services.council.council_runner import council_validate_report
    from ..schemas.research import Claim, ClaimType
    from ..core.config import get_settings

    init_firebase()
    db = get_db()
    settings = get_settings()
    try:
        # ── Stage 1: Retrieve papers ──────────────────────────────────────────
        _update_status(db, session_id, "retrieving")
        query_plan = await decompose_query(query)
        all_papers = []
        for variant in (query_plan.query_variants or [query])[:2]:
            # arXiv — always reliable
            try:
                arxiv_hits = await asyncio.to_thread(
                    arxiv_adapter.search, variant, session_id, max_results=10
                )
                all_papers += arxiv_hits
                # Report this call's hits, not the running total.
                print(f"[arXiv] fetched {len(arxiv_hits)} papers for: {variant[:50]}")
            except Exception as e:
                print(f"[arXiv] error: {e}")
            # Semantic Scholar — skip gracefully if rate-limited
            try:
                ss = await asyncio.to_thread(
                    semantic_scholar_adapter.search, variant, session_id, limit=8
                )
                all_papers += ss
                print(f"[S2] fetched {len(ss)} papers")
            except Exception as e:
                print(f"[S2] error: {e}")

        # Deduplicate by the first 60 characters of the lower-cased title
        seen: set[str] = set()
        unique_papers = []
        for p in all_papers:
            key = (p.title or "").lower()[:60]
            if key and key not in seen:
                seen.add(key)
                unique_papers.append(p)
        print(f"[Pipeline] {len(unique_papers)} unique papers after dedup")
        if not unique_papers:
            _update_status(db, session_id, "failed", {"error": "No papers found. Try a different query."})
            return

        # ── Stage 2: Rank ─────────────────────────────────────────────────────
        _update_status(db, session_id, "ranking")
        try:
            ranked = await asyncio.to_thread(rank_papers, query, unique_papers, top_k=20)
        except Exception as e:
            print(f"[Ranker] error: {e} — using unranked")
            ranked = unique_papers[:20]

        # ── Stage 3: Citation expansion ───────────────────────────────────────
        _update_status(db, session_id, "expanding_citations")
        # The slice is a copy, so appending to `ranked` below is safe.
        for paper in ranked[:3]:
            if paper.external_source == "semantic_scholar" and paper.source_paper_id:
                try:
                    extras = (await asyncio.to_thread(
                        semantic_scholar_adapter.get_references,
                        paper.source_paper_id, session_id, limit=5,
                    )) or []
                    for ep in extras:
                        key = (ep.title or "").lower()[:60]
                        if key and key not in seen:
                            seen.add(key)
                            ranked.append(ep)
                except Exception as e:
                    print(f"[Citations] error: {e}")

        final_papers = ranked[:settings.max_papers_per_session]
        _update_status(db, session_id, "extracting", {"paperCount": len(final_papers)})

        # Save basic paper metadata immediately so UI can show them
        for paper in final_papers:
            try:
                db.collection("papers").document(paper.id).set({
                    **paper.model_dump(),
                    "sessionId": session_id,
                })
            except Exception as e:
                print(f"[Firestore] paper save error: {e}")

        # ── Stage 4: PDF Parsing ──────────────────────────────────────────────
        _update_status(db, session_id, "parsing_pdfs")
        paper_sections: dict[str, dict] = {}
        for paper in final_papers:
            if paper.pdf_url:
                try:
                    sections = await asyncio.to_thread(parse_pdf, paper.pdf_url)
                    paper_sections[paper.id] = sections or {}
                except Exception as e:
                    print(f"[PDF] parse error for {paper.title[:40]}: {e}")
                    paper_sections[paper.id] = {}

        # ── Stage 5: LLM Extraction ───────────────────────────────────────────
        _update_status(db, session_id, "extracting")
        for paper in final_papers:
            try:
                sections = paper_sections.get(paper.id, {})
                paper.extraction = await extract_paper(paper.title, paper.abstract, sections)
                db.collection("papers").document(paper.id).update(
                    {"extraction": paper.extraction.model_dump()}
                )
            except Exception as e:
                print(f"[Extract] error for {paper.title[:40]}: {e}")

        # ── Stage 6: Graph + Contradictions ───────────────────────────────────
        _update_status(db, session_id, "building_graph")
        # Up to two key findings per paper become the claims that are compared.
        claims: list[Claim] = []
        for paper in final_papers:
            if paper.extraction and paper.extraction.key_findings:
                for finding in paper.extraction.key_findings[:2]:
                    claims.append(Claim(
                        id=str(uuid.uuid4()),
                        paper_id=paper.id,
                        paper_title=paper.title,
                        claim_text=finding,
                        claim_type=ClaimType.empirical,
                        evidence_span=finding,
                        confidence=0.75,
                    ))
        try:
            contradictions = await detect_contradictions(claims)
            for c in contradictions:
                c.session_id = session_id
                db.collection("contradictions").document(c.id).set(c.model_dump())
        except Exception as e:
            print(f"[Contradictions] error: {e}")
            contradictions = []

        try:
            graph = build_graph(final_papers)
            db.collection("graphs").document(session_id).set(graph.model_dump())
            # ── Persist to Neo4j AuraDB ───────────────────────────────────────
            try:
                from ..services.graph.neo4j_writer import (
                    ensure_constraints, write_papers, write_contradictions,
                )
                await asyncio.to_thread(ensure_constraints)
                await asyncio.to_thread(write_papers, final_papers, session_id)
                await asyncio.to_thread(write_contradictions, contradictions)
                print(f"[Neo4j] Knowledge graph persisted for session {session_id}.")
            except Exception as neo4j_err:
                print(f"[Neo4j] Non-fatal — could not write to AuraDB: {neo4j_err}")
        except Exception as e:
            print(f"[Graph] error: {e}")

        # ── Stage 7: Report ───────────────────────────────────────────────────
        _update_status(db, session_id, "generating_report")
        try:
            report = await generate_report(query, final_papers, session_id)
        except Exception as e:
            print(f"[Report] error: {e}")
            from ..schemas.research import Report
            report = Report(
                id=str(uuid.uuid4()),
                session_id=session_id,
                report_markdown=f"# Literature Review: {query}\n\nReport generation failed: {e}",
                confidence_score=0.0,
                created_at=datetime.now(timezone.utc).isoformat(),
            )

        # ── Stage 8: LLM Council ──────────────────────────────────────────────
        _update_status(db, session_id, "council_review")
        try:
            papers_summary = "\n".join(
                f"- {p.title}: {p.extraction.summary if p.extraction else ''}"
                for p in final_papers[:8]
            )
            council_notes = await council_validate_report(report.report_markdown, papers_summary)
            report.report_markdown += (
                "\n\n---\n## LLM Council Notes\n"
                f"**Extractor:** {council_notes['extractor_notes'][:300]}\n\n"
                f"**Skeptic:** {council_notes['skeptic_notes'][:300]}\n\n"
                f"**Synthesizer:** {council_notes['synthesizer_notes'][:300]}"
            )
        except Exception as e:
            print(f"[Council] error: {e}")
        db.collection("reports").document(report.id).set(report.model_dump())

        # ── Done ──────────────────────────────────────────────────────────────
        _update_status(db, session_id, "completed", {
            "completedAt": datetime.now(timezone.utc).isoformat(),
            "paperCount":  len(final_papers),
        })
        print(f"[Pipeline] COMPLETE — {len(final_papers)} papers, session {session_id[:8]}")
    except Exception as exc:
        print(f"[Pipeline] FATAL: {exc}")
        try:
            _update_status(db, session_id, "failed", {"error": str(exc)})
        except Exception:
            pass
        raise

requirements.txt ADDED Viewed

	@@ -0,0 +1,40 @@

+# Web framework
+fastapi>=0.115.0
+uvicorn[standard]>=0.32.0
+python-multipart>=0.0.12
+# Firebase Admin (replaces PostgreSQL)
+firebase-admin>=6.6.0
+# Background tasks (FastAPI BackgroundTasks — no Redis required)
+# LLM adapters
+groq>=0.13.0
+openai>=1.58.0
+anthropic>=0.40.0
+google-generativeai>=0.8.3
+# PDF parsing
+PyMuPDF>=1.24.14
+pdfplumber>=0.11.4
+# NLP / Embeddings
+sentence-transformers>=3.3.1
+scikit-learn>=1.5.2
+spacy>=3.8.3
+# Graph
+networkx>=3.4.2
+# Academic APIs
+arxiv>=2.1.3
+requests>=2.32.3
+httpx>=0.28.1
+# Validation
+pydantic>=2.10.3
+pydantic-settings>=2.6.1
+# Utilities
+python-dotenv>=1.0.1
+tenacity>=9.0.0