Spaces:

Abu-Sameer-66
/

SciPeerAI-API

Sleeping

App Files Files Community

Abu-Sameer-66 committed 24 days ago

Commit

a0aeb5a

1 Parent(s): 0295edd

feat: add PDF upload endpoint with 14-module analysis and security hardening

Browse files

Files changed (2) hide show

src/scipeerai/api/routes.py +315 -2
src/scipeerai/core/pdf_parser.py +93 -16

src/scipeerai/api/routes.py CHANGED Viewed

@@ -750,7 +750,7 @@ from src.scipeerai.modules.effect_size_validator import EffectSizeValidator
 from src.scipeerai.modules.retraction_checker import RetractionChecker
 from src.scipeerai.modules.citation_cartel import CitationCartelDetector
 from src.scipeerai.modules.llm_detector import LLMDetector
 router = APIRouter(prefix="/api/v1", tags=["Analysis"])
 # ── Section-aware text extraction — replaces flat truncation ──────────────────
@@ -897,7 +897,7 @@ _effect_size_engine = EffectSizeValidator()
 _retraction_engine  = RetractionChecker()
 _cartel_engine      = CitationCartelDetector()
 _llm_engine         = LLMDetector()
 # ── Request / Response Models ─────────────────────────────────────────────────
@@ -1620,5 +1620,318 @@ def analyze_llm(request: LLMRequest):
             ],
             flags_count = result.flags_count,
         )
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))

 from src.scipeerai.modules.retraction_checker import RetractionChecker
 from src.scipeerai.modules.citation_cartel import CitationCartelDetector
 from src.scipeerai.modules.llm_detector import LLMDetector
+from src.scipeerai.core.pdf_parser import PDFParser
 router = APIRouter(prefix="/api/v1", tags=["Analysis"])
 # ── Section-aware text extraction — replaces flat truncation ──────────────────
 _retraction_engine  = RetractionChecker()
 _cartel_engine      = CitationCartelDetector()
 _llm_engine         = LLMDetector()
+_pdf_parser = PDFParser()
 # ── Request / Response Models ─────────────────────────────────────────────────
             ],
             flags_count = result.flags_count,
         )
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+# ── Full PDF Analysis — Master Endpoint ──────────────────────────────────────
+class ModuleSummary(BaseModel):
+    """One row of the full-PDF report: the outcome of a single analysis module."""
+    module: str        # display name of the module, e.g. "GRIM Test"
+    risk_level: str    # risk label, either taken from the engine or derived from the score
+    risk_score: float  # numeric score; the same value is fed into the overall average
+    summary: str       # short human-readable summary for this module
+    flags_count: int   # how many flags the module raised
+class FullPDFResponse(BaseModel):
+    """Response body of the full-PDF endpoint: paper facts plus the combined verdict."""
+    # ── Facts about the uploaded paper ──
+    paper_title: str
+    page_count: int
+    figure_count: int
+    file_size_kb: float
+    sha256: str
+    # ── Aggregated result across the modules that completed ──
+    overall_score: float
+    overall_risk: str
+    integrity_verdict: str
+    modules: list[ModuleSummary]
+    top_flags: list[str]
+    analyzed_by: str
+def _compute_overall(scores: list[float]) -> tuple[float, str]:
+    """
+    Collapse per-module risk scores into (mean rounded to 3 places, risk label).
+    The label is "HIGH" from 0.7 upward, "MEDIUM" from 0.4 upward, else "LOW".
+    With no scores at all the result is (0.0, "LOW").
+    """
+    if not scores:
+        return 0.0, "LOW"
+    mean = round(sum(scores) / len(scores), 3)
+    # Thresholds are checked from most to least severe; first match wins.
+    for cutoff, label in ((0.7, "HIGH"), (0.4, "MEDIUM")):
+        if mean >= cutoff:
+            return mean, label
+    return mean, "LOW"
+def _verdict(risk: str) -> str:
+    """Translate an overall risk label into the verdict sentence shown to the user."""
+    sentences = {
+        "HIGH":   "Serious integrity concerns detected. Manual expert review strongly recommended.",
+        "MEDIUM": "Some integrity issues found. Careful review advised before publication.",
+        "LOW":    "No major integrity issues detected. Paper appears scientifically sound.",
+    }
+    try:
+        return sentences[risk]
+    except KeyError:
+        # Any label outside HIGH / MEDIUM / LOW has no verdict text.
+        return "Unknown"
+@router.post("/analyze/full-pdf", response_model=FullPDFResponse)
+async def analyze_full_pdf(file: UploadFile = File(...)):
+    """
+    Master endpoint — Upload a PDF and run all 14 analysis modules at once.
+    Returns a unified integrity report with per-module scores and top flags.
+    Designed for PhD researchers who want a single comprehensive analysis.
+
+    Every module runs best-effort: a module that raises is logged and left
+    out of the report rather than aborting the whole request.
+
+    Raises:
+        HTTPException(400): the parser rejected the upload (ValueError from
+            ``parse_bytes`` — empty file, too large, not a PDF, too many pages).
+        HTTPException(422): fewer than 100 characters of text were extracted.
+        HTTPException(500): no module completed, or any other unexpected error.
+    """
+    import logging  # function-scope import: the module's import list is not visible here
+    log = logging.getLogger(__name__)
+    try:
+        file_bytes = await file.read()
+        try:
+            # An upload may arrive without a filename; use the parser's own default then.
+            paper = _pdf_parser.parse_bytes(file_bytes, file.filename or "upload.pdf")
+        except ValueError as e:
+            # Validation failures are a client problem, not a server fault.
+            raise HTTPException(status_code=400, detail=str(e)) from e
+        text = paper.full_text
+        if len(text.strip()) < 100:
+            raise HTTPException(
+                status_code=422,
+                detail="PDF text extraction failed or paper is too short. "
+                       "Ensure the PDF contains selectable text (not a scanned image)."
+            )
+        modules_run = []
+        top_flags   = []
+        scores      = []
+        # ── Module 1: Statistical Audit ───────────────────────────
+        try:
+            r = _stat_engine.analyze(_smart_text(text, "statistics"))
+            modules_run.append(ModuleSummary(
+                module="Statistical Audit",
+                risk_level=r.risk_level,
+                risk_score=r.risk_score,
+                summary=r.summary,
+                flags_count=len(r.flags),
+            ))
+            scores.append(r.risk_score)
+            for f in r.flags[:2]:
+                top_flags.append(f"[Statistics] {f.description}")
+        except Exception:
+            log.warning("Statistical Audit module failed", exc_info=True)
+        # ── Module 2: Methodology Checker ─────────────────────────
+        try:
+            abstract = paper.sections.get("abstract", "")
+            r = _method_engine.analyze(_smart_text(text, "methodology"), abstract)
+            # This engine's result is scored here from the number of flags raised.
+            score = 0.7 if len(r.flags) > 2 else 0.3 if r.flags else 0.1
+            modules_run.append(ModuleSummary(
+                module="Methodology Checker",
+                risk_level="HIGH" if score >= 0.7 else "MEDIUM" if score >= 0.4 else "LOW",
+                risk_score=score,
+                summary=r.llm_assessment or f"{len(r.flags)} methodology issues found.",
+                flags_count=len(r.flags),
+            ))
+            scores.append(score)
+            for f in r.flags[:2]:
+                top_flags.append(f"[Methodology] {f.issue}")
+        except Exception:
+            log.warning("Methodology Checker module failed", exc_info=True)
+        # ── Module 3: Citation Integrity ──────────────────────────
+        try:
+            r = _citation_engine.analyze(_smart_text(text, "citations"), "")
+            modules_run.append(ModuleSummary(
+                module="Citation Integrity",
+                risk_level=r.risk_level,
+                risk_score=r.risk_score,
+                summary=r.summary,
+                flags_count=len(r.flags),
+            ))
+            scores.append(r.risk_score)
+            for f in r.flags[:2]:
+                top_flags.append(f"[Citations] {f.description}")
+        except Exception:
+            log.warning("Citation Integrity module failed", exc_info=True)
+        # ── Module 4: Reproducibility ─────────────────────────────
+        try:
+            r = _repro_engine.analyze(_smart_text(text, "reproducibility"))
+            # Reproducibility is a "higher is better" score, so invert it into a risk.
+            modules_run.append(ModuleSummary(
+                module="Reproducibility Scanner",
+                risk_level=r.risk_level,
+                risk_score=1.0 - r.reproducibility_score,
+                summary=r.summary,
+                flags_count=len(r.flags),
+            ))
+            scores.append(1.0 - r.reproducibility_score)
+            for f in r.flags[:1]:
+                top_flags.append(f"[Reproducibility] {f.description}")
+        except Exception:
+            log.warning("Reproducibility Scanner module failed", exc_info=True)
+        # ── Module 5: Novelty ─────────────────────────────────────
+        try:
+            r = _novelty_engine.analyze(
+                _smart_text(text, "novelty", per_section_limit=2000),
+                paper.title,
+            )
+            modules_run.append(ModuleSummary(
+                module="Novelty Scorer",
+                risk_level=r.risk_level,
+                risk_score=getattr(r, "risk_score", 1.0 - r.novelty_score),
+                summary=r.summary,
+                flags_count=len(getattr(r, "flags", []) or []),
+            ))
+            scores.append(getattr(r, "risk_score", 1.0 - r.novelty_score))
+        except Exception:
+            log.warning("Novelty Scorer module failed", exc_info=True)
+        # ── Module 6: GRIM Test ───────────────────────────────────
+        try:
+            r = _grim_engine.analyze(_smart_text(text, "grim"))
+            modules_run.append(ModuleSummary(
+                module="GRIM Test",
+                risk_level=r.risk_level,
+                risk_score=r.grim_score,
+                summary=r.summary,
+                flags_count=r.flags_count,
+            ))
+            scores.append(r.grim_score)
+            for f in r.flags[:1]:
+                top_flags.append(f"[GRIM] {f.description}")
+        except Exception:
+            log.warning("GRIM Test module failed", exc_info=True)
+        # ── Module 7: SPRITE Test ─────────────────────────────────
+        try:
+            r = _sprite_engine.analyze(_smart_text(text, "sprite"))
+            modules_run.append(ModuleSummary(
+                module="SPRITE Test",
+                risk_level=r.risk_level,
+                risk_score=r.sprite_score,
+                summary=r.summary,
+                flags_count=r.flags_count,
+            ))
+            scores.append(r.sprite_score)
+        except Exception:
+            log.warning("SPRITE Test module failed", exc_info=True)
+        # ── Module 8: Granularity ─────────────────────────────────
+        try:
+            r = _granularity_engine.analyze(_smart_text(text, "granularity"))
+            modules_run.append(ModuleSummary(
+                module="Granularity Analyzer",
+                risk_level=r.risk_level,
+                risk_score=r.granularity_score,
+                summary=r.summary,
+                flags_count=r.flags_count,
+            ))
+            scores.append(r.granularity_score)
+        except Exception:
+            log.warning("Granularity Analyzer module failed", exc_info=True)
+        # ── Module 9: P-Curve ─────────────────────────────────────
+        try:
+            r = _pcurve_engine.analyze(_smart_text(text, "pcurve"))
+            modules_run.append(ModuleSummary(
+                module="P-Curve Analyzer",
+                risk_level=r.risk_level,
+                risk_score=r.pcurve_score,
+                summary=r.summary,
+                flags_count=r.flags_count,
+            ))
+            scores.append(r.pcurve_score)
+            for f in r.flags[:1]:
+                top_flags.append(f"[P-Curve] {f.description}")
+        except Exception:
+            log.warning("P-Curve Analyzer module failed", exc_info=True)
+        # ── Module 10: Effect Size ────────────────────────────────
+        try:
+            r = _effect_size_engine.analyze(_smart_text(text, "effect_size"))
+            modules_run.append(ModuleSummary(
+                module="Effect Size Validator",
+                risk_level=r.risk_level,
+                risk_score=r.effect_score,
+                summary=r.summary,
+                flags_count=r.flags_count,
+            ))
+            scores.append(r.effect_score)
+        except Exception:
+            log.warning("Effect Size Validator module failed", exc_info=True)
+        # ── Module 11: Retraction Checker ─────────────────────────
+        try:
+            r = _retraction_engine.analyze(_smart_text(text, "retraction"))
+            modules_run.append(ModuleSummary(
+                module="Retraction Checker",
+                risk_level=r.risk_level,
+                risk_score=r.retraction_score,
+                summary=r.summary,
+                flags_count=r.flags_count,
+            ))
+            scores.append(r.retraction_score)
+            for f in r.flags[:1]:
+                top_flags.append(f"[Retraction] {f.description}")
+        except Exception:
+            log.warning("Retraction Checker module failed", exc_info=True)
+        # ── Module 12: Citation Cartel ────────────────────────────
+        try:
+            r = _cartel_engine.analyze(_smart_text(text, "cartel"))
+            modules_run.append(ModuleSummary(
+                module="Citation Cartel Detector",
+                risk_level=r.risk_level,
+                risk_score=r.cartel_score,
+                summary=r.summary,
+                flags_count=r.flags_count,
+            ))
+            scores.append(r.cartel_score)
+            for f in r.flags[:1]:
+                top_flags.append(f"[Cartel] {f.description}")
+        except Exception:
+            log.warning("Citation Cartel Detector module failed", exc_info=True)
+        # ── Module 13: LLM Detector ───────────────────────────────
+        try:
+            r = _llm_engine.analyze(_smart_text(text, "llm"))
+            modules_run.append(ModuleSummary(
+                module="LLM Paper Detector",
+                risk_level=r.risk_level,
+                risk_score=r.llm_score,
+                summary=r.summary,
+                flags_count=r.flags_count,
+            ))
+            scores.append(r.llm_score)
+            for f in r.flags[:1]:
+                top_flags.append(f"[LLM] {f.description}")
+        except Exception:
+            log.warning("LLM Paper Detector module failed", exc_info=True)
+        # ── Module 14: Figure Forensics ───────────────────────────
+        # The figure engine is given a path, so the upload is spooled to a temp file.
+        tmp_path = None
+        try:
+            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
+                # Record the path before writing so a failed write is still cleaned up.
+                tmp_path = tmp.name
+                tmp.write(file_bytes)
+            r = _figure_engine.analyze(tmp_path)
+            # Each duplicate pair adds 0.3 to the score, capped at 1.0.
+            fig_score = min(len(r.duplicate_pairs) * 0.3, 1.0)
+            modules_run.append(ModuleSummary(
+                module="Figure Forensics",
+                risk_level="HIGH" if fig_score >= 0.7 else "MEDIUM" if fig_score >= 0.3 else "LOW",
+                risk_score=fig_score,
+                summary=f"{r.figures_found} figures found. {len(r.duplicate_pairs)} duplicate pairs detected.",
+                flags_count=len(r.flags),
+            ))
+            scores.append(fig_score)
+        except Exception:
+            log.warning("Figure Forensics module failed", exc_info=True)
+        finally:
+            # delete=False means nothing else removes the file; clean up on every path.
+            if tmp_path and os.path.exists(tmp_path):
+                try:
+                    os.unlink(tmp_path)
+                except OSError:
+                    log.warning("Could not remove temp file %s", tmp_path, exc_info=True)
+        # ── Final Score ───────────────────────────────────────────
+        if not modules_run:
+            # With zero results the average would be 0.0 / "LOW", which would read as a
+            # clean bill of health for a paper that was never actually analysed.
+            raise HTTPException(
+                status_code=500,
+                detail="All analysis modules failed; no integrity verdict could be produced.",
+            )
+        overall_score, overall_risk = _compute_overall(scores)
+        return FullPDFResponse(
+            paper_title       = paper.title,
+            page_count        = paper.page_count,
+            figure_count      = paper.figure_count,
+            file_size_kb      = paper.metadata.get("file_size_kb", 0.0),
+            sha256            = paper.metadata.get("sha256", ""),
+            overall_score     = overall_score,
+            overall_risk      = overall_risk,
+            integrity_verdict = _verdict(overall_risk),
+            modules           = modules_run,
+            top_flags         = top_flags[:10],
+            analyzed_by       = "SciPeerAI v1.5.0 — 14-Module Pipeline",
+        )
+    except HTTPException:
+        raise
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))

src/scipeerai/core/pdf_parser.py CHANGED Viewed

@@ -4,13 +4,22 @@ PDF Parser — Entry point for every paper analysis.
 Every analysis we do depends on clean text extraction.
 If this is wrong, everything downstream is wrong.
 So we isolate it, test it, make it bulletproof.
 """
 import fitz  # PyMuPDF
-from dataclasses import dataclass, field
 from pathlib import Path
 @dataclass
 class ParsedPaper:
     """
@@ -29,8 +38,14 @@ class ParsedPaper:
 class PDFParser:
     """
     Handles PDF ingestion and structured text extraction.
-    Class-based because later we may need configuration,
-    different format handling, and caching.
     """
     def __init__(self):
@@ -40,10 +55,12 @@ class PDFParser:
             "related work", "background", "experiments"
         ]
     def parse(self, pdf_path: str) -> ParsedPaper:
         """
-        Main entry point.
-        Takes a PDF path, returns a structured ParsedPaper object.
         """
         pdf_path = Path(pdf_path)
@@ -53,7 +70,30 @@ class PDFParser:
         if pdf_path.suffix.lower() != ".pdf":
             raise ValueError(f"Expected PDF file, got: {pdf_path.suffix}")
-        doc = fitz.open(str(pdf_path))
         full_text = self._extract_text(doc)
         sections = self._split_into_sections(full_text)
@@ -70,9 +110,52 @@ class PDFParser:
             page_count=page_count,
             has_figures=figure_count > 0,
             figure_count=figure_count,
-            metadata=self._extract_metadata(pdf_path),
         )
     def _extract_text(self, doc: fitz.Document) -> str:
         """Extract all text from every page."""
         pages = []
@@ -83,7 +166,7 @@ class PDFParser:
     def _split_into_sections(self, text: str) -> dict:
         """
         Split paper into named sections by common academic headers.
-        Not perfect — PDFs are messy — but good enough for analysis.
         """
         sections = {}
         text_lower = text.lower()
@@ -113,7 +196,7 @@ class PDFParser:
     def _extract_title(self, doc: fitz.Document, full_text: str) -> str:
         """
-        Try PDF metadata first, fall back to first meaningful line.
         """
         meta = doc.metadata
         if meta and meta.get("title"):
@@ -124,10 +207,4 @@ class PDFParser:
             if len(line) > 10:
                 return line
-        return "Unknown Title"
-    def _extract_metadata(self, pdf_path: Path) -> dict:
-        return {
-            "filename": pdf_path.name,
-            "file_size_kb": round(pdf_path.stat().st_size / 1024, 2),
-        }

 Every analysis we do depends on clean text extraction.
 If this is wrong, everything downstream is wrong.
 So we isolate it, test it, make it bulletproof.
+SciPeerAI v1.5.0 — Built by Sameer Nadeem
 """
+import hashlib
 import fitz  # PyMuPDF
+from dataclasses import dataclass
 from pathlib import Path
+# ── Security constants ────────────────────────────────────────────
+MAX_FILE_SIZE_MB = 50
+MAX_PAGES = 300
+ALLOWED_MIME_HEADER = b"%PDF"  # Every real PDF starts with %PDF
 @dataclass
 class ParsedPaper:
     """
 class PDFParser:
     """
     Handles PDF ingestion and structured text extraction.
+    Supports both file-path parsing and raw-bytes parsing (API uploads).
+    Security hardened:
+    - Magic byte validation (rejects fake PDFs)
+    - File size limit (50 MB)
+    - Page count limit (300 pages)
+    - Filename sanitization
+    - SHA-256 fingerprint per upload
     """
     def __init__(self):
             "related work", "background", "experiments"
         ]
+    # ── Public: parse from disk path ─────────────────────────────
     def parse(self, pdf_path: str) -> ParsedPaper:
         """
+        Parse from a file path on disk.
+        Used internally and in tests.
         """
         pdf_path = Path(pdf_path)
         if pdf_path.suffix.lower() != ".pdf":
             raise ValueError(f"Expected PDF file, got: {pdf_path.suffix}")
+        raw_bytes = pdf_path.read_bytes()
+        return self.parse_bytes(raw_bytes, filename=pdf_path.name)
+    # ── Public: parse from raw bytes (API upload) ─────────────────
+    def parse_bytes(self, file_bytes: bytes, filename: str = "upload.pdf") -> ParsedPaper:
+        """
+        Parse a PDF from raw bytes — used when file arrives through API.
+        FastAPI UploadFile → await file.read() → pass here.
+        Security checks run before any parsing begins.
+        """
+        filename = self._sanitize_filename(filename)
+        self._validate_bytes(file_bytes, filename)
+        doc = fitz.open(stream=file_bytes, filetype="pdf")
+        if len(doc) > MAX_PAGES:
+            doc.close()
+            raise ValueError(
+                f"Paper has {len(doc)} pages. "
+                f"Maximum allowed is {MAX_PAGES} pages."
+            )
         full_text = self._extract_text(doc)
         sections = self._split_into_sections(full_text)
             page_count=page_count,
             has_figures=figure_count > 0,
             figure_count=figure_count,
+            metadata={
+                "filename": filename,
+                "file_size_kb": round(len(file_bytes) / 1024, 2),
+                "sha256": hashlib.sha256(file_bytes).hexdigest(),
+            },
         )
+    # ── Security helpers ──────────────────────────────────────────
+    def _validate_bytes(self, file_bytes: bytes, filename: str) -> None:
+        """
+        Gatekeeper run on the raw upload before it is handed to the parser.
+        Rejects, in this order:
+        1. an empty payload
+        2. a payload larger than MAX_FILE_SIZE_MB
+        3. a payload that does not begin with the PDF magic bytes
+        ``filename`` is accepted but not consulted by any of the checks.
+        Raises ValueError describing the first check that fails.
+        """
+        n_bytes = len(file_bytes)
+        if n_bytes == 0:
+            raise ValueError("Uploaded file is empty.")
+        limit = MAX_FILE_SIZE_MB * 1024 * 1024
+        if n_bytes > limit:
+            megabytes = round(n_bytes / 1024 / 1024, 1)
+            message = (
+                f"File too large: {megabytes} MB. "
+                f"Maximum allowed: {MAX_FILE_SIZE_MB} MB."
+            )
+            raise ValueError(message)
+        has_pdf_magic = file_bytes.startswith(ALLOWED_MIME_HEADER)
+        if has_pdf_magic:
+            return
+        raise ValueError(
+            "Invalid file. Only real PDF files are accepted. "
+            "Renamed or corrupted files are rejected."
+        )
+    @staticmethod
+    def _sanitize_filename(filename: str) -> str:
+        """
+        Reduce a client-supplied filename to its final path component and
+        enforce a .pdf extension (case-insensitive).
+        Prevents directory traversal attacks like ../../etc/passwd.pdf and the
+        Windows-style equivalent ..\\..\\evil.pdf, whatever OS the server runs on.
+
+        Returns the bare filename.
+        Raises ValueError when the name is missing or does not end in .pdf.
+        """
+        if not filename:
+            # Covers None and "": report it the same way as any other bad name
+            # instead of letting Path() fail with a TypeError.
+            raise ValueError(f"Expected a PDF filename, got: {filename}")
+        # On POSIX a backslash is an ordinary character, so Path().name alone would
+        # keep "..\\..\\evil.pdf" intact; normalise separators before splitting.
+        name = Path(filename.replace("\\", "/")).name
+        if not name.lower().endswith(".pdf"):
+            raise ValueError(f"Expected a PDF filename, got: {filename}")
+        return name
+    # ── Private: extraction logic ─────────────────────────────────
     def _extract_text(self, doc: fitz.Document) -> str:
         """Extract all text from every page."""
         pages = []
     def _split_into_sections(self, text: str) -> dict:
         """
         Split paper into named sections by common academic headers.
+        Not perfect — PDFs are messy — but good enough for downstream analysis.
         """
         sections = {}
         text_lower = text.lower()
     def _extract_title(self, doc: fitz.Document, full_text: str) -> str:
         """
+        Try PDF metadata first, fall back to first meaningful line of text.
         """
         meta = doc.metadata
         if meta and meta.get("title"):
             if len(line) > 10:
                 return line
+        return "Unknown Title"