Abu-Sameer-66 committed on
Commit
a0aeb5a
·
1 Parent(s): 0295edd

feat: add PDF upload endpoint with 14-module analysis and security hardening

Browse files
src/scipeerai/api/routes.py CHANGED
@@ -750,7 +750,7 @@ from src.scipeerai.modules.effect_size_validator import EffectSizeValidator
750
  from src.scipeerai.modules.retraction_checker import RetractionChecker
751
  from src.scipeerai.modules.citation_cartel import CitationCartelDetector
752
  from src.scipeerai.modules.llm_detector import LLMDetector
753
-
754
  router = APIRouter(prefix="/api/v1", tags=["Analysis"])
755
 
756
  # ── Section-aware text extraction — replaces flat truncation ──────────────────
@@ -897,7 +897,7 @@ _effect_size_engine = EffectSizeValidator()
897
  _retraction_engine = RetractionChecker()
898
  _cartel_engine = CitationCartelDetector()
899
  _llm_engine = LLMDetector()
900
-
901
 
902
  # ── Request / Response Models ─────────────────────────────────────────────────
903
 
@@ -1620,5 +1620,318 @@ def analyze_llm(request: LLMRequest):
1620
  ],
1621
  flags_count = result.flags_count,
1622
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1623
  except Exception as e:
1624
  raise HTTPException(status_code=500, detail=str(e))
 
750
  from src.scipeerai.modules.retraction_checker import RetractionChecker
751
  from src.scipeerai.modules.citation_cartel import CitationCartelDetector
752
  from src.scipeerai.modules.llm_detector import LLMDetector
753
+ from src.scipeerai.core.pdf_parser import PDFParser
754
  router = APIRouter(prefix="/api/v1", tags=["Analysis"])
755
 
756
  # ── Section-aware text extraction — replaces flat truncation ──────────────────
 
897
  _retraction_engine = RetractionChecker()
898
  _cartel_engine = CitationCartelDetector()
899
  _llm_engine = LLMDetector()
900
+ _pdf_parser = PDFParser()
901
 
902
  # ── Request / Response Models ─────────────────────────────────────────────────
903
 
 
1620
  ],
1621
  flags_count = result.flags_count,
1622
  )
1623
+ except Exception as e:
1624
+ raise HTTPException(status_code=500, detail=str(e))
1625
+
1626
+ # ── Full PDF Analysis — Master Endpoint ──────────────────────────────────────
1627
+
1628
class ModuleSummary(BaseModel):
    """One analysis module's entry in the full-PDF integrity report."""

    # Display name of the module, e.g. "Statistical Audit" or "GRIM Test".
    module: str
    # Risk label for this module; the endpoint passes through the engine's own
    # label or derives "HIGH" / "MEDIUM" / "LOW" from the score.
    risk_level: str
    # Per-module score; the same value is averaged into the overall score.
    risk_score: float
    # Human-readable summary produced for this module.
    summary: str
    # Number of flags the module reported.
    flags_count: int
1634
+
1635
class FullPDFResponse(BaseModel):
    """Response body of POST /analyze/full-pdf: one unified integrity report."""

    # Title, page count and figure count as reported by the PDF parser.
    paper_title: str
    page_count: int
    figure_count: int
    # Upload size in KiB and SHA-256 hex digest, taken from the parser metadata.
    file_size_kb: float
    sha256: str
    # Mean of the per-module scores (rounded to 3 decimals) and its risk label.
    overall_score: float
    overall_risk: str
    # Sentence-length verdict derived from overall_risk.
    integrity_verdict: str
    # One entry per module that completed successfully.
    modules: list[ModuleSummary]
    # At most 10 flag descriptions, each prefixed with its module tag.
    top_flags: list[str]
    # Pipeline/version banner string.
    analyzed_by: str
1647
+
1648
+
1649
+ def _compute_overall(scores: list[float]) -> tuple[float, str]:
1650
+ avg = round(sum(scores) / len(scores), 3) if scores else 0.0
1651
+ if avg >= 0.7:
1652
+ level = "HIGH"
1653
+ elif avg >= 0.4:
1654
+ level = "MEDIUM"
1655
+ else:
1656
+ level = "LOW"
1657
+ return avg, level
1658
+
1659
+
1660
+ def _verdict(risk: str) -> str:
1661
+ return {
1662
+ "HIGH": "Serious integrity concerns detected. Manual expert review strongly recommended.",
1663
+ "MEDIUM": "Some integrity issues found. Careful review advised before publication.",
1664
+ "LOW": "No major integrity issues detected. Paper appears scientifically sound.",
1665
+ }.get(risk, "Unknown")
1666
+
1667
+
1668
@router.post("/analyze/full-pdf", response_model=FullPDFResponse)
async def analyze_full_pdf(file: UploadFile = File(...)):
    """
    Master endpoint — upload a PDF and run all 14 analysis modules at once.

    Returns a unified integrity report with per-module scores and top flags.
    Designed for PhD researchers who want a single comprehensive analysis.

    Every module is best-effort: a module that raises is logged and left out
    of the report, and the overall score is the average over the modules
    that completed.

    Raises:
        HTTPException(400): the PDF parser rejected the upload with a
            ValueError (empty, too large, not a PDF, too many pages,
            bad filename).
        HTTPException(422): fewer than 100 characters of text were extracted.
        HTTPException(500): every module failed, or an unexpected error
            occurred.
    """
    # Function-scope imports: only this endpoint needs them.
    import logging
    from contextlib import contextmanager

    log = logging.getLogger(__name__)

    try:
        file_bytes = await file.read()

        # A rejected upload is the client's fault, not a server error.
        # UploadFile.filename may be None, so fall back to a neutral name.
        try:
            paper = _pdf_parser.parse_bytes(file_bytes, file.filename or "upload.pdf")
        except ValueError as exc:
            raise HTTPException(status_code=400, detail=str(exc)) from exc
        text = paper.full_text

        if len(text.strip()) < 100:
            raise HTTPException(
                status_code=422,
                detail="PDF text extraction failed or paper is too short. "
                       "Ensure the PDF contains selectable text (not a scanned image)."
            )

        modules_run = []
        top_flags = []
        scores = []

        @contextmanager
        def best_effort(module_name):
            """Run one module; log and swallow its failure instead of aborting."""
            try:
                yield
            except Exception:
                log.warning(
                    "Full-PDF analysis: module %r failed and was skipped",
                    module_name,
                    exc_info=True,
                )

        def record(module_name, risk_level, risk_score, summary, flags_count):
            """Append one module's summary and its score to the report."""
            modules_run.append(ModuleSummary(
                module=module_name,
                risk_level=risk_level,
                risk_score=risk_score,
                summary=summary,
                flags_count=flags_count,
            ))
            scores.append(risk_score)

        def add_flags(tag, flags, limit, attr="description"):
            """Copy the first ``limit`` flags into top_flags, prefixed by ``tag``."""
            for item in flags[:limit]:
                top_flags.append(f"[{tag}] {getattr(item, attr)}")

        # ── Module 1: Statistical Audit ───────────────────────────
        with best_effort("Statistical Audit"):
            r = _stat_engine.analyze(_smart_text(text, "statistics"))
            record("Statistical Audit", r.risk_level, r.risk_score, r.summary, len(r.flags))
            add_flags("Statistics", r.flags, 2)

        # ── Module 2: Methodology Checker ─────────────────────────
        with best_effort("Methodology Checker"):
            abstract = paper.sections.get("abstract", "")
            r = _method_engine.analyze(_smart_text(text, "methodology"), abstract)
            score = 0.7 if len(r.flags) > 2 else 0.3 if r.flags else 0.1
            # NOTE(review): with scores limited to 0.7 / 0.3 / 0.1 the "MEDIUM"
            # branch (>= 0.4) is never reached — confirm the intended mapping.
            level = "HIGH" if score >= 0.7 else "MEDIUM" if score >= 0.4 else "LOW"
            record(
                "Methodology Checker",
                level,
                score,
                r.llm_assessment or f"{len(r.flags)} methodology issues found.",
                len(r.flags),
            )
            add_flags("Methodology", r.flags, 2, attr="issue")

        # ── Module 3: Citation Integrity ──────────────────────────
        with best_effort("Citation Integrity"):
            r = _citation_engine.analyze(_smart_text(text, "citations"), "")
            record("Citation Integrity", r.risk_level, r.risk_score, r.summary, len(r.flags))
            add_flags("Citations", r.flags, 2)

        # ── Module 4: Reproducibility ─────────────────────────────
        with best_effort("Reproducibility Scanner"):
            r = _repro_engine.analyze(_smart_text(text, "reproducibility"))
            # The engine reports a reproducibility score; invert it into a risk.
            record(
                "Reproducibility Scanner",
                r.risk_level,
                1.0 - r.reproducibility_score,
                r.summary,
                len(r.flags),
            )
            add_flags("Reproducibility", r.flags, 1)

        # ── Module 5: Novelty ─────────────────────────────────────
        with best_effort("Novelty Scorer"):
            r = _novelty_engine.analyze(
                _smart_text(text, "novelty", per_section_limit=2000),
                paper.title,
            )
            novelty_risk = getattr(r, "risk_score", 1.0 - r.novelty_score)
            record(
                "Novelty Scorer",
                r.risk_level,
                novelty_risk,
                r.summary,
                len(getattr(r, "flags", []) or []),
            )

        # ── Module 6: GRIM Test ───────────────────────────────────
        with best_effort("GRIM Test"):
            r = _grim_engine.analyze(_smart_text(text, "grim"))
            record("GRIM Test", r.risk_level, r.grim_score, r.summary, r.flags_count)
            add_flags("GRIM", r.flags, 1)

        # ── Module 7: SPRITE Test ─────────────────────────────────
        with best_effort("SPRITE Test"):
            r = _sprite_engine.analyze(_smart_text(text, "sprite"))
            record("SPRITE Test", r.risk_level, r.sprite_score, r.summary, r.flags_count)

        # ── Module 8: Granularity ─────────────────────────────────
        with best_effort("Granularity Analyzer"):
            r = _granularity_engine.analyze(_smart_text(text, "granularity"))
            record("Granularity Analyzer", r.risk_level, r.granularity_score, r.summary, r.flags_count)

        # ── Module 9: P-Curve ─────────────────────────────────────
        with best_effort("P-Curve Analyzer"):
            r = _pcurve_engine.analyze(_smart_text(text, "pcurve"))
            record("P-Curve Analyzer", r.risk_level, r.pcurve_score, r.summary, r.flags_count)
            add_flags("P-Curve", r.flags, 1)

        # ── Module 10: Effect Size ────────────────────────────────
        with best_effort("Effect Size Validator"):
            r = _effect_size_engine.analyze(_smart_text(text, "effect_size"))
            record("Effect Size Validator", r.risk_level, r.effect_score, r.summary, r.flags_count)

        # ── Module 11: Retraction Checker ─────────────────────────
        with best_effort("Retraction Checker"):
            r = _retraction_engine.analyze(_smart_text(text, "retraction"))
            record("Retraction Checker", r.risk_level, r.retraction_score, r.summary, r.flags_count)
            add_flags("Retraction", r.flags, 1)

        # ── Module 12: Citation Cartel ────────────────────────────
        with best_effort("Citation Cartel Detector"):
            r = _cartel_engine.analyze(_smart_text(text, "cartel"))
            record("Citation Cartel Detector", r.risk_level, r.cartel_score, r.summary, r.flags_count)
            add_flags("Cartel", r.flags, 1)

        # ── Module 13: LLM Detector ───────────────────────────────
        with best_effort("LLM Paper Detector"):
            r = _llm_engine.analyze(_smart_text(text, "llm"))
            record("LLM Paper Detector", r.risk_level, r.llm_score, r.summary, r.flags_count)
            add_flags("LLM", r.flags, 1)

        # ── Module 14: Figure Forensics ───────────────────────────
        with best_effort("Figure Forensics"):
            tmp_path = None
            try:
                # The figure engine takes a path, so spill the upload to disk.
                with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                    tmp.write(file_bytes)
                    tmp_path = tmp.name
                r = _figure_engine.analyze(tmp_path)
                fig_score = min(len(r.duplicate_pairs) * 0.3, 1.0)
                record(
                    "Figure Forensics",
                    "HIGH" if fig_score >= 0.7 else "MEDIUM" if fig_score >= 0.3 else "LOW",
                    fig_score,
                    f"{r.figures_found} figures found. {len(r.duplicate_pairs)} duplicate pairs detected.",
                    len(r.flags),
                )
            finally:
                # Always remove the temp copy, even when the engine raised.
                if tmp_path and os.path.exists(tmp_path):
                    os.unlink(tmp_path)

        # An empty report would average to 0.0 / "LOW" and claim the paper is
        # sound although nothing was actually checked.
        if not modules_run:
            raise HTTPException(
                status_code=500,
                detail="All analysis modules failed; no integrity report could be produced.",
            )

        # ── Final Score ───────────────────────────────────────────
        overall_score, overall_risk = _compute_overall(scores)

        return FullPDFResponse(
            paper_title=paper.title,
            page_count=paper.page_count,
            figure_count=paper.figure_count,
            file_size_kb=paper.metadata.get("file_size_kb", 0.0),
            sha256=paper.metadata.get("sha256", ""),
            overall_score=overall_score,
            overall_risk=overall_risk,
            integrity_verdict=_verdict(overall_risk),
            modules=modules_run,
            top_flags=top_flags[:10],
            analyzed_by="SciPeerAI v1.5.0 — 14-Module Pipeline",
        )

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
src/scipeerai/core/pdf_parser.py CHANGED
@@ -4,13 +4,22 @@ PDF Parser — Entry point for every paper analysis.
4
  Every analysis we do depends on clean text extraction.
5
  If this is wrong, everything downstream is wrong.
6
  So we isolate it, test it, make it bulletproof.
 
 
7
  """
8
 
 
9
  import fitz # PyMuPDF
10
- from dataclasses import dataclass, field
11
  from pathlib import Path
12
 
13
 
 
 
 
 
 
 
14
  @dataclass
15
  class ParsedPaper:
16
  """
@@ -29,8 +38,14 @@ class ParsedPaper:
29
  class PDFParser:
30
  """
31
  Handles PDF ingestion and structured text extraction.
32
- Class-based because later we may need configuration,
33
- different format handling, and caching.
 
 
 
 
 
 
34
  """
35
 
36
  def __init__(self):
@@ -40,10 +55,12 @@ class PDFParser:
40
  "related work", "background", "experiments"
41
  ]
42
 
 
 
43
  def parse(self, pdf_path: str) -> ParsedPaper:
44
  """
45
- Main entry point.
46
- Takes a PDF path, returns a structured ParsedPaper object.
47
  """
48
  pdf_path = Path(pdf_path)
49
 
@@ -53,7 +70,30 @@ class PDFParser:
53
  if pdf_path.suffix.lower() != ".pdf":
54
  raise ValueError(f"Expected PDF file, got: {pdf_path.suffix}")
55
 
56
- doc = fitz.open(str(pdf_path))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
  full_text = self._extract_text(doc)
59
  sections = self._split_into_sections(full_text)
@@ -70,9 +110,52 @@ class PDFParser:
70
  page_count=page_count,
71
  has_figures=figure_count > 0,
72
  figure_count=figure_count,
73
- metadata=self._extract_metadata(pdf_path),
 
 
 
 
74
  )
75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  def _extract_text(self, doc: fitz.Document) -> str:
77
  """Extract all text from every page."""
78
  pages = []
@@ -83,7 +166,7 @@ class PDFParser:
83
  def _split_into_sections(self, text: str) -> dict:
84
  """
85
  Split paper into named sections by common academic headers.
86
- Not perfect — PDFs are messy — but good enough for analysis.
87
  """
88
  sections = {}
89
  text_lower = text.lower()
@@ -113,7 +196,7 @@ class PDFParser:
113
 
114
  def _extract_title(self, doc: fitz.Document, full_text: str) -> str:
115
  """
116
- Try PDF metadata first, fall back to first meaningful line.
117
  """
118
  meta = doc.metadata
119
  if meta and meta.get("title"):
@@ -124,10 +207,4 @@ class PDFParser:
124
  if len(line) > 10:
125
  return line
126
 
127
- return "Unknown Title"
128
-
129
- def _extract_metadata(self, pdf_path: Path) -> dict:
130
- return {
131
- "filename": pdf_path.name,
132
- "file_size_kb": round(pdf_path.stat().st_size / 1024, 2),
133
- }
 
4
  Every analysis we do depends on clean text extraction.
5
  If this is wrong, everything downstream is wrong.
6
  So we isolate it, test it, make it bulletproof.
7
+
8
+ SciPeerAI v1.5.0 — Built by Sameer Nadeem
9
  """
10
 
11
+ import hashlib
12
  import fitz # PyMuPDF
13
+ from dataclasses import dataclass
14
  from pathlib import Path
15
 
16
 
17
+ # ── Security constants ────────────────────────────────────────────
18
# Largest upload accepted by PDFParser._validate_bytes, in megabytes.
MAX_FILE_SIZE_MB = 50
# Documents with more pages than this are rejected by PDFParser.parse_bytes.
MAX_PAGES = 300
# Magic bytes (a file signature, not a MIME type, despite the name) that the
# uploaded data must start with.
ALLOWED_MIME_HEADER = b"%PDF"  # Every real PDF starts with %PDF
21
+
22
+
23
  @dataclass
24
  class ParsedPaper:
25
  """
 
38
  class PDFParser:
39
  """
40
  Handles PDF ingestion and structured text extraction.
41
+ Supports both file-path parsing and raw-bytes parsing (API uploads).
42
+
43
+ Security hardened:
44
+ - Magic byte validation (rejects fake PDFs)
45
+ - File size limit (50 MB)
46
+ - Page count limit (300 pages)
47
+ - Filename sanitization
48
+ - SHA-256 fingerprint per upload
49
  """
50
 
51
  def __init__(self):
 
55
  "related work", "background", "experiments"
56
  ]
57
 
58
+ # ── Public: parse from disk path ─────────────────────────────
59
+
60
  def parse(self, pdf_path: str) -> ParsedPaper:
61
  """
62
+ Parse from a file path on disk.
63
+ Used internally and in tests.
64
  """
65
  pdf_path = Path(pdf_path)
66
 
 
70
  if pdf_path.suffix.lower() != ".pdf":
71
  raise ValueError(f"Expected PDF file, got: {pdf_path.suffix}")
72
 
73
+ raw_bytes = pdf_path.read_bytes()
74
+ return self.parse_bytes(raw_bytes, filename=pdf_path.name)
75
+
76
+ # ── Public: parse from raw bytes (API upload) ─────────────────
77
+
78
+ def parse_bytes(self, file_bytes: bytes, filename: str = "upload.pdf") -> ParsedPaper:
79
+ """
80
+ Parse a PDF from raw bytes — used when the file arrives through the API.
81
+ FastAPI UploadFile → await file.read() → pass here.
82
+
83
+ Security checks run before any parsing begins.
84
+ """
85
+ filename = self._sanitize_filename(filename)
86
+
87
+ self._validate_bytes(file_bytes, filename)
88
+
89
+ doc = fitz.open(stream=file_bytes, filetype="pdf")
90
+
91
+ if len(doc) > MAX_PAGES:
92
+ doc.close()
93
+ raise ValueError(
94
+ f"Paper has {len(doc)} pages. "
95
+ f"Maximum allowed is {MAX_PAGES} pages."
96
+ )
97
 
98
  full_text = self._extract_text(doc)
99
  sections = self._split_into_sections(full_text)
 
110
  page_count=page_count,
111
  has_figures=figure_count > 0,
112
  figure_count=figure_count,
113
+ metadata={
114
+ "filename": filename,
115
+ "file_size_kb": round(len(file_bytes) / 1024, 2),
116
+ "sha256": hashlib.sha256(file_bytes).hexdigest(),
117
+ },
118
  )
119
 
120
+ # ── Security helpers ──────────────────────────────────────────
121
+
122
+ def _validate_bytes(self, file_bytes: bytes, filename: str) -> None:
123
+ """
124
+ Three security checks before we touch the file:
125
+ 1. Not empty
126
+ 2. Under size limit
127
+ 3. Real PDF magic bytes β€” not a renamed .exe or .zip
128
+ """
129
+ if len(file_bytes) == 0:
130
+ raise ValueError("Uploaded file is empty.")
131
+
132
+ max_bytes = MAX_FILE_SIZE_MB * 1024 * 1024
133
+ if len(file_bytes) > max_bytes:
134
+ size_mb = round(len(file_bytes) / 1024 / 1024, 1)
135
+ raise ValueError(
136
+ f"File too large: {size_mb} MB. "
137
+ f"Maximum allowed: {MAX_FILE_SIZE_MB} MB."
138
+ )
139
+
140
+ if not file_bytes.startswith(ALLOWED_MIME_HEADER):
141
+ raise ValueError(
142
+ "Invalid file. Only real PDF files are accepted. "
143
+ "Renamed or corrupted files are rejected."
144
+ )
145
+
146
+ @staticmethod
147
+ def _sanitize_filename(filename: str) -> str:
148
+ """
149
+ Strip path traversal characters and enforce .pdf extension.
150
+ Prevents directory traversal attacks like ../../etc/passwd.pdf
151
+ """
152
+ name = Path(filename).name # strips any directory component
153
+ if not name.lower().endswith(".pdf"):
154
+ raise ValueError(f"Expected a PDF filename, got: {filename}")
155
+ return name
156
+
157
+ # ── Private: extraction logic ─────────────────────────────────
158
+
159
  def _extract_text(self, doc: fitz.Document) -> str:
160
  """Extract all text from every page."""
161
  pages = []
 
166
  def _split_into_sections(self, text: str) -> dict:
167
  """
168
  Split paper into named sections by common academic headers.
169
+ Not perfect — PDFs are messy — but good enough for downstream analysis.
170
  """
171
  sections = {}
172
  text_lower = text.lower()
 
196
 
197
  def _extract_title(self, doc: fitz.Document, full_text: str) -> str:
198
  """
199
+ Try PDF metadata first, fall back to first meaningful line of text.
200
  """
201
  meta = doc.metadata
202
  if meta and meta.get("title"):
 
207
  if len(line) > 10:
208
  return line
209
 
210
+ return "Unknown Title"