Spaces:
Sleeping
Sleeping
Abu-Sameer-66 committed on
Commit ·
a0aeb5a
1
Parent(s): 0295edd
feat: add PDF upload endpoint with 14-module analysis and security hardening
Browse files- src/scipeerai/api/routes.py +315 -2
- src/scipeerai/core/pdf_parser.py +93 -16
src/scipeerai/api/routes.py
CHANGED
|
@@ -750,7 +750,7 @@ from src.scipeerai.modules.effect_size_validator import EffectSizeValidator
|
|
| 750 |
from src.scipeerai.modules.retraction_checker import RetractionChecker
|
| 751 |
from src.scipeerai.modules.citation_cartel import CitationCartelDetector
|
| 752 |
from src.scipeerai.modules.llm_detector import LLMDetector
|
| 753 |
-
|
| 754 |
router = APIRouter(prefix="/api/v1", tags=["Analysis"])
|
| 755 |
|
| 756 |
# ── Section-aware text extraction — replaces flat truncation ──────────────────
|
|
@@ -897,7 +897,7 @@ _effect_size_engine = EffectSizeValidator()
|
|
| 897 |
_retraction_engine = RetractionChecker()
|
| 898 |
_cartel_engine = CitationCartelDetector()
|
| 899 |
_llm_engine = LLMDetector()
|
| 900 |
-
|
| 901 |
|
| 902 |
# ── Request / Response Models ─────────────────────────────────────────────────
|
| 903 |
|
|
@@ -1620,5 +1620,318 @@ def analyze_llm(request: LLMRequest):
|
|
| 1620 |
],
|
| 1621 |
flags_count = result.flags_count,
|
| 1622 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1623 |
except Exception as e:
|
| 1624 |
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
| 750 |
from src.scipeerai.modules.retraction_checker import RetractionChecker
|
| 751 |
from src.scipeerai.modules.citation_cartel import CitationCartelDetector
|
| 752 |
from src.scipeerai.modules.llm_detector import LLMDetector
|
| 753 |
+
from src.scipeerai.core.pdf_parser import PDFParser
|
| 754 |
router = APIRouter(prefix="/api/v1", tags=["Analysis"])
|
| 755 |
|
| 756 |
# ── Section-aware text extraction — replaces flat truncation ──────────────────
|
|
|
|
| 897 |
_retraction_engine = RetractionChecker()
|
| 898 |
_cartel_engine = CitationCartelDetector()
|
| 899 |
_llm_engine = LLMDetector()
|
| 900 |
+
_pdf_parser = PDFParser()
|
| 901 |
|
| 902 |
# ── Request / Response Models ─────────────────────────────────────────────────
|
| 903 |
|
|
|
|
| 1620 |
],
|
| 1621 |
flags_count = result.flags_count,
|
| 1622 |
)
|
| 1623 |
+
except Exception as e:
|
| 1624 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 1625 |
+
|
| 1626 |
+
# ── Full PDF Analysis — Master Endpoint ──────────────────────────────────────
|
| 1627 |
+
|
| 1628 |
+
class ModuleSummary(BaseModel):
    """One analysis module's entry in the full-PDF integrity report."""

    # Display name of the module, e.g. "Statistical Audit".
    module: str
    # Risk label reported for (or derived from) the module's result.
    risk_level: str
    # Numeric risk score that feeds the overall average.
    risk_score: float
    # Short human-readable description of the module's findings.
    summary: str
    # Number of flags the module raised.
    flags_count: int
|
| 1634 |
+
|
| 1635 |
+
class FullPDFResponse(BaseModel):
    """Unified report returned by the full-PDF analysis endpoint."""

    # ── Document facts taken from the parsed PDF ──
    paper_title: str
    page_count: int
    figure_count: int
    file_size_kb: float
    sha256: str

    # ── Aggregate result across all modules that ran ──
    overall_score: float
    overall_risk: str
    integrity_verdict: str

    # ── Per-module detail ──
    modules: list[ModuleSummary]
    top_flags: list[str]

    # Free-text identifier of the pipeline that produced the report.
    analyzed_by: str
|
| 1647 |
+
|
| 1648 |
+
|
| 1649 |
+
def _compute_overall(scores: list[float]) -> tuple[float, str]:
|
| 1650 |
+
avg = round(sum(scores) / len(scores), 3) if scores else 0.0
|
| 1651 |
+
if avg >= 0.7:
|
| 1652 |
+
level = "HIGH"
|
| 1653 |
+
elif avg >= 0.4:
|
| 1654 |
+
level = "MEDIUM"
|
| 1655 |
+
else:
|
| 1656 |
+
level = "LOW"
|
| 1657 |
+
return avg, level
|
| 1658 |
+
|
| 1659 |
+
|
| 1660 |
+
def _verdict(risk: str) -> str:
|
| 1661 |
+
return {
|
| 1662 |
+
"HIGH": "Serious integrity concerns detected. Manual expert review strongly recommended.",
|
| 1663 |
+
"MEDIUM": "Some integrity issues found. Careful review advised before publication.",
|
| 1664 |
+
"LOW": "No major integrity issues detected. Paper appears scientifically sound.",
|
| 1665 |
+
}.get(risk, "Unknown")
|
| 1666 |
+
|
| 1667 |
+
|
| 1668 |
+
def _record_module(modules_run, scores, *, module, risk_level, risk_score, summary, flags_count):
    """Append one module's summary to the report and its score to the score list.

    The summary is built first, so a validation failure leaves both lists
    untouched and the module is simply absent from the report.
    """
    modules_run.append(ModuleSummary(
        module=module,
        risk_level=risk_level,
        risk_score=risk_score,
        summary=summary,
        flags_count=flags_count,
    ))
    scores.append(risk_score)


def _collect_flags(top_flags, tag, flags, limit, attr="description"):
    """Append the text of the first *limit* flags to *top_flags*, prefixed with ``[tag]``."""
    for flag in flags[:limit]:
        top_flags.append(f"[{tag}] {getattr(flag, attr)}")


@router.post("/analyze/full-pdf", response_model=FullPDFResponse)
async def analyze_full_pdf(file: UploadFile = File(...)):
    """
    Master endpoint — upload a PDF and run all 14 analysis modules at once.
    Returns a unified integrity report with per-module scores and top flags.

    Each module runs independently: a module that raises is logged and left
    out of the report, and the overall score is the average over the modules
    that succeeded.

    Raises:
        HTTPException 422: the upload is rejected by the parser's validation
            (raised there as ValueError) or yields fewer than 100 characters
            of text.
        HTTPException 500: every module failed, or an unexpected error
            occurred.
    """
    import logging  # local import keeps this block self-contained

    log = logging.getLogger(__name__)

    try:
        # NOTE(review): the whole upload is read into memory before the
        # parser applies its size limit; consider a bounded read.
        file_bytes = await file.read()

        try:
            # UploadFile.filename may be None; fall back to the parser's default.
            paper = _pdf_parser.parse_bytes(file_bytes, file.filename or "upload.pdf")
        except ValueError as e:
            # Validation failures are client errors, not server faults.
            raise HTTPException(status_code=422, detail=str(e)) from e
        text = paper.full_text

        if len(text.strip()) < 100:
            raise HTTPException(
                status_code=422,
                detail="PDF text extraction failed or paper is too short. "
                       "Ensure the PDF contains selectable text (not a scanned image)."
            )

        modules_run = []
        top_flags = []
        scores = []

        # NOTE(review): the engine calls below are plain synchronous calls
        # inside an async handler; if they are slow, consider a threadpool.

        # ── Module 1: Statistical Audit ──
        try:
            r = _stat_engine.analyze(_smart_text(text, "statistics"))
            _record_module(
                modules_run, scores,
                module="Statistical Audit",
                risk_level=r.risk_level,
                risk_score=r.risk_score,
                summary=r.summary,
                flags_count=len(r.flags),
            )
            _collect_flags(top_flags, "Statistics", r.flags, 2)
        except Exception:
            log.exception("Statistical Audit module failed")

        # ── Module 2: Methodology Checker ──
        try:
            abstract = paper.sections.get("abstract", "")
            r = _method_engine.analyze(_smart_text(text, "methodology"), abstract)
            # This module's score is derived here from its flag count.
            score = 0.7 if len(r.flags) > 2 else 0.3 if r.flags else 0.1
            _record_module(
                modules_run, scores,
                module="Methodology Checker",
                risk_level="HIGH" if score >= 0.7 else "MEDIUM" if score >= 0.4 else "LOW",
                risk_score=score,
                summary=r.llm_assessment or f"{len(r.flags)} methodology issues found.",
                flags_count=len(r.flags),
            )
            _collect_flags(top_flags, "Methodology", r.flags, 2, attr="issue")
        except Exception:
            log.exception("Methodology Checker module failed")

        # ── Module 3: Citation Integrity ──
        try:
            r = _citation_engine.analyze(_smart_text(text, "citations"), "")
            _record_module(
                modules_run, scores,
                module="Citation Integrity",
                risk_level=r.risk_level,
                risk_score=r.risk_score,
                summary=r.summary,
                flags_count=len(r.flags),
            )
            _collect_flags(top_flags, "Citations", r.flags, 2)
        except Exception:
            log.exception("Citation Integrity module failed")

        # ── Module 4: Reproducibility ──
        try:
            r = _repro_engine.analyze(_smart_text(text, "reproducibility"))
            # Inverted so that a higher value means higher risk.
            repro_risk = 1.0 - r.reproducibility_score
            _record_module(
                modules_run, scores,
                module="Reproducibility Scanner",
                risk_level=r.risk_level,
                risk_score=repro_risk,
                summary=r.summary,
                flags_count=len(r.flags),
            )
            _collect_flags(top_flags, "Reproducibility", r.flags, 1)
        except Exception:
            log.exception("Reproducibility Scanner module failed")

        # ── Module 5: Novelty ──
        try:
            r = _novelty_engine.analyze(
                _smart_text(text, "novelty", per_section_limit=2000),
                paper.title,
            )
            novelty_risk = getattr(r, "risk_score", 1.0 - r.novelty_score)
            _record_module(
                modules_run, scores,
                module="Novelty Scorer",
                risk_level=r.risk_level,
                risk_score=novelty_risk,
                summary=r.summary,
                flags_count=len(getattr(r, "flags", []) or []),
            )
        except Exception:
            log.exception("Novelty Scorer module failed")

        # ── Module 6: GRIM Test ──
        try:
            r = _grim_engine.analyze(_smart_text(text, "grim"))
            _record_module(
                modules_run, scores,
                module="GRIM Test",
                risk_level=r.risk_level,
                risk_score=r.grim_score,
                summary=r.summary,
                flags_count=r.flags_count,
            )
            _collect_flags(top_flags, "GRIM", r.flags, 1)
        except Exception:
            log.exception("GRIM Test module failed")

        # ── Module 7: SPRITE Test ──
        try:
            r = _sprite_engine.analyze(_smart_text(text, "sprite"))
            _record_module(
                modules_run, scores,
                module="SPRITE Test",
                risk_level=r.risk_level,
                risk_score=r.sprite_score,
                summary=r.summary,
                flags_count=r.flags_count,
            )
        except Exception:
            log.exception("SPRITE Test module failed")

        # ── Module 8: Granularity ──
        try:
            r = _granularity_engine.analyze(_smart_text(text, "granularity"))
            _record_module(
                modules_run, scores,
                module="Granularity Analyzer",
                risk_level=r.risk_level,
                risk_score=r.granularity_score,
                summary=r.summary,
                flags_count=r.flags_count,
            )
        except Exception:
            log.exception("Granularity Analyzer module failed")

        # ── Module 9: P-Curve ──
        try:
            r = _pcurve_engine.analyze(_smart_text(text, "pcurve"))
            _record_module(
                modules_run, scores,
                module="P-Curve Analyzer",
                risk_level=r.risk_level,
                risk_score=r.pcurve_score,
                summary=r.summary,
                flags_count=r.flags_count,
            )
            _collect_flags(top_flags, "P-Curve", r.flags, 1)
        except Exception:
            log.exception("P-Curve Analyzer module failed")

        # ── Module 10: Effect Size ──
        try:
            r = _effect_size_engine.analyze(_smart_text(text, "effect_size"))
            _record_module(
                modules_run, scores,
                module="Effect Size Validator",
                risk_level=r.risk_level,
                risk_score=r.effect_score,
                summary=r.summary,
                flags_count=r.flags_count,
            )
        except Exception:
            log.exception("Effect Size Validator module failed")

        # ── Module 11: Retraction Checker ──
        try:
            r = _retraction_engine.analyze(_smart_text(text, "retraction"))
            _record_module(
                modules_run, scores,
                module="Retraction Checker",
                risk_level=r.risk_level,
                risk_score=r.retraction_score,
                summary=r.summary,
                flags_count=r.flags_count,
            )
            _collect_flags(top_flags, "Retraction", r.flags, 1)
        except Exception:
            log.exception("Retraction Checker module failed")

        # ── Module 12: Citation Cartel ──
        try:
            r = _cartel_engine.analyze(_smart_text(text, "cartel"))
            _record_module(
                modules_run, scores,
                module="Citation Cartel Detector",
                risk_level=r.risk_level,
                risk_score=r.cartel_score,
                summary=r.summary,
                flags_count=r.flags_count,
            )
            _collect_flags(top_flags, "Cartel", r.flags, 1)
        except Exception:
            log.exception("Citation Cartel Detector module failed")

        # ── Module 13: LLM Detector ──
        try:
            r = _llm_engine.analyze(_smart_text(text, "llm"))
            _record_module(
                modules_run, scores,
                module="LLM Paper Detector",
                risk_level=r.risk_level,
                risk_score=r.llm_score,
                summary=r.summary,
                flags_count=r.flags_count,
            )
            _collect_flags(top_flags, "LLM", r.flags, 1)
        except Exception:
            log.exception("LLM Paper Detector module failed")

        # ── Module 14: Figure Forensics ──
        # The figure engine takes a path, so the upload is spilled to a
        # temp file that must be removed whether or not analysis succeeds.
        tmp_path = None
        try:
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                tmp_path = tmp.name  # recorded before writing so cleanup always sees it
                tmp.write(file_bytes)
            r = _figure_engine.analyze(tmp_path)
            fig_score = min(len(r.duplicate_pairs) * 0.3, 1.0)
            _record_module(
                modules_run, scores,
                module="Figure Forensics",
                risk_level="HIGH" if fig_score >= 0.7 else "MEDIUM" if fig_score >= 0.3 else "LOW",
                risk_score=fig_score,
                summary=f"{r.figures_found} figures found. {len(r.duplicate_pairs)} duplicate pairs detected.",
                flags_count=len(r.flags),
            )
        except Exception:
            log.exception("Figure Forensics module failed")
        finally:
            if tmp_path and os.path.exists(tmp_path):
                try:
                    os.unlink(tmp_path)
                except OSError:
                    log.warning("Could not remove temporary file %s", tmp_path)

        # With no successful module the average would be 0.0 / "LOW", which
        # would wrongly certify the paper as sound — report failure instead.
        if not modules_run:
            raise HTTPException(
                status_code=500,
                detail="All analysis modules failed; no integrity report could be produced.",
            )

        # ── Final Score ──
        overall_score, overall_risk = _compute_overall(scores)

        return FullPDFResponse(
            paper_title=paper.title,
            page_count=paper.page_count,
            figure_count=paper.figure_count,
            file_size_kb=paper.metadata.get("file_size_kb", 0.0),
            sha256=paper.metadata.get("sha256", ""),
            overall_score=overall_score,
            overall_risk=overall_risk,
            integrity_verdict=_verdict(overall_risk),
            modules=modules_run,
            top_flags=top_flags[:10],
            analyzed_by="SciPeerAI v1.5.0 — 14-Module Pipeline",
        )

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
|
src/scipeerai/core/pdf_parser.py
CHANGED
|
@@ -4,13 +4,22 @@ PDF Parser — Entry point for every paper analysis.
|
|
| 4 |
Every analysis we do depends on clean text extraction.
|
| 5 |
If this is wrong, everything downstream is wrong.
|
| 6 |
So we isolate it, test it, make it bulletproof.
|
|
|
|
|
|
|
| 7 |
"""
|
| 8 |
|
|
|
|
| 9 |
import fitz # PyMuPDF
|
| 10 |
-
from dataclasses import dataclass
|
| 11 |
from pathlib import Path
|
| 12 |
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
@dataclass
|
| 15 |
class ParsedPaper:
|
| 16 |
"""
|
|
@@ -29,8 +38,14 @@ class ParsedPaper:
|
|
| 29 |
class PDFParser:
|
| 30 |
"""
|
| 31 |
Handles PDF ingestion and structured text extraction.
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
"""
|
| 35 |
|
| 36 |
def __init__(self):
|
|
@@ -40,10 +55,12 @@ class PDFParser:
|
|
| 40 |
"related work", "background", "experiments"
|
| 41 |
]
|
| 42 |
|
|
|
|
|
|
|
| 43 |
def parse(self, pdf_path: str) -> ParsedPaper:
|
| 44 |
"""
|
| 45 |
-
|
| 46 |
-
|
| 47 |
"""
|
| 48 |
pdf_path = Path(pdf_path)
|
| 49 |
|
|
@@ -53,7 +70,30 @@ class PDFParser:
|
|
| 53 |
if pdf_path.suffix.lower() != ".pdf":
|
| 54 |
raise ValueError(f"Expected PDF file, got: {pdf_path.suffix}")
|
| 55 |
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
full_text = self._extract_text(doc)
|
| 59 |
sections = self._split_into_sections(full_text)
|
|
@@ -70,9 +110,52 @@ class PDFParser:
|
|
| 70 |
page_count=page_count,
|
| 71 |
has_figures=figure_count > 0,
|
| 72 |
figure_count=figure_count,
|
| 73 |
-
metadata=
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
)
|
| 75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
def _extract_text(self, doc: fitz.Document) -> str:
|
| 77 |
"""Extract all text from every page."""
|
| 78 |
pages = []
|
|
@@ -83,7 +166,7 @@ class PDFParser:
|
|
| 83 |
def _split_into_sections(self, text: str) -> dict:
|
| 84 |
"""
|
| 85 |
Split paper into named sections by common academic headers.
|
| 86 |
-
Not perfect — PDFs are messy — but good enough for analysis.
|
| 87 |
"""
|
| 88 |
sections = {}
|
| 89 |
text_lower = text.lower()
|
|
@@ -113,7 +196,7 @@ class PDFParser:
|
|
| 113 |
|
| 114 |
def _extract_title(self, doc: fitz.Document, full_text: str) -> str:
|
| 115 |
"""
|
| 116 |
-
Try PDF metadata first, fall back to first meaningful line.
|
| 117 |
"""
|
| 118 |
meta = doc.metadata
|
| 119 |
if meta and meta.get("title"):
|
|
@@ -124,10 +207,4 @@ class PDFParser:
|
|
| 124 |
if len(line) > 10:
|
| 125 |
return line
|
| 126 |
|
| 127 |
-
return "Unknown Title"
|
| 128 |
-
|
| 129 |
-
def _extract_metadata(self, pdf_path: Path) -> dict:
|
| 130 |
-
return {
|
| 131 |
-
"filename": pdf_path.name,
|
| 132 |
-
"file_size_kb": round(pdf_path.stat().st_size / 1024, 2),
|
| 133 |
-
}
|
|
|
|
| 4 |
Every analysis we do depends on clean text extraction.
|
| 5 |
If this is wrong, everything downstream is wrong.
|
| 6 |
So we isolate it, test it, make it bulletproof.
|
| 7 |
+
|
| 8 |
+
SciPeerAI v1.5.0 — Built by Sameer Nadeem
|
| 9 |
"""
|
| 10 |
|
| 11 |
+
import hashlib
|
| 12 |
import fitz # PyMuPDF
|
| 13 |
+
from dataclasses import dataclass
|
| 14 |
from pathlib import Path
|
| 15 |
|
| 16 |
|
| 17 |
+
# ── Security constants ────────────────────────────────────────────
|
| 18 |
+
MAX_FILE_SIZE_MB = 50
|
| 19 |
+
MAX_PAGES = 300
|
| 20 |
+
ALLOWED_MIME_HEADER = b"%PDF" # Every real PDF starts with %PDF
|
| 21 |
+
|
| 22 |
+
|
| 23 |
@dataclass
|
| 24 |
class ParsedPaper:
|
| 25 |
"""
|
|
|
|
| 38 |
class PDFParser:
|
| 39 |
"""
|
| 40 |
Handles PDF ingestion and structured text extraction.
|
| 41 |
+
Supports both file-path parsing and raw-bytes parsing (API uploads).
|
| 42 |
+
|
| 43 |
+
Security hardened:
|
| 44 |
+
- Magic byte validation (rejects fake PDFs)
|
| 45 |
+
- File size limit (50 MB)
|
| 46 |
+
- Page count limit (300 pages)
|
| 47 |
+
- Filename sanitization
|
| 48 |
+
- SHA-256 fingerprint per upload
|
| 49 |
"""
|
| 50 |
|
| 51 |
def __init__(self):
|
|
|
|
| 55 |
"related work", "background", "experiments"
|
| 56 |
]
|
| 57 |
|
| 58 |
+
# ── Public: parse from disk path ─────────────────────────────
|
| 59 |
+
|
| 60 |
def parse(self, pdf_path: str) -> ParsedPaper:
|
| 61 |
"""
|
| 62 |
+
Parse from a file path on disk.
|
| 63 |
+
Used internally and in tests.
|
| 64 |
"""
|
| 65 |
pdf_path = Path(pdf_path)
|
| 66 |
|
|
|
|
| 70 |
if pdf_path.suffix.lower() != ".pdf":
|
| 71 |
raise ValueError(f"Expected PDF file, got: {pdf_path.suffix}")
|
| 72 |
|
| 73 |
+
raw_bytes = pdf_path.read_bytes()
|
| 74 |
+
return self.parse_bytes(raw_bytes, filename=pdf_path.name)
|
| 75 |
+
|
| 76 |
+
# ── Public: parse from raw bytes (API upload) ─────────────────
|
| 77 |
+
|
| 78 |
+
def parse_bytes(self, file_bytes: bytes, filename: str = "upload.pdf") -> ParsedPaper:
|
| 79 |
+
"""
|
| 80 |
+
Parse a PDF from raw bytes — used when the file arrives through the API.
|
| 81 |
+
FastAPI UploadFile → await file.read() → pass here.
|
| 82 |
+
|
| 83 |
+
Security checks run before any parsing begins.
|
| 84 |
+
"""
|
| 85 |
+
filename = self._sanitize_filename(filename)
|
| 86 |
+
|
| 87 |
+
self._validate_bytes(file_bytes, filename)
|
| 88 |
+
|
| 89 |
+
doc = fitz.open(stream=file_bytes, filetype="pdf")
|
| 90 |
+
|
| 91 |
+
if len(doc) > MAX_PAGES:
|
| 92 |
+
doc.close()
|
| 93 |
+
raise ValueError(
|
| 94 |
+
f"Paper has {len(doc)} pages. "
|
| 95 |
+
f"Maximum allowed is {MAX_PAGES} pages."
|
| 96 |
+
)
|
| 97 |
|
| 98 |
full_text = self._extract_text(doc)
|
| 99 |
sections = self._split_into_sections(full_text)
|
|
|
|
| 110 |
page_count=page_count,
|
| 111 |
has_figures=figure_count > 0,
|
| 112 |
figure_count=figure_count,
|
| 113 |
+
metadata={
|
| 114 |
+
"filename": filename,
|
| 115 |
+
"file_size_kb": round(len(file_bytes) / 1024, 2),
|
| 116 |
+
"sha256": hashlib.sha256(file_bytes).hexdigest(),
|
| 117 |
+
},
|
| 118 |
)
|
| 119 |
|
| 120 |
+
# ── Security helpers ──────────────────────────────────────────
|
| 121 |
+
|
| 122 |
+
def _validate_bytes(self, file_bytes: bytes, filename: str) -> None:
    """
    Reject uploads that should never reach the PDF parser.

    Checks, in order: the payload is non-empty, it does not exceed
    MAX_FILE_SIZE_MB, and it begins with the ALLOWED_MIME_HEADER magic
    bytes. *filename* is accepted but not consulted by these checks.

    Raises:
        ValueError: on the first check that fails.
    """
    size = len(file_bytes)

    if size == 0:
        raise ValueError("Uploaded file is empty.")

    limit = MAX_FILE_SIZE_MB * 1024 * 1024
    if size > limit:
        size_mb = round(size / 1024 / 1024, 1)
        raise ValueError(
            f"File too large: {size_mb} MB. "
            f"Maximum allowed: {MAX_FILE_SIZE_MB} MB."
        )

    # Content sniffing: the payload itself must carry the PDF signature.
    if file_bytes.startswith(ALLOWED_MIME_HEADER):
        return

    raise ValueError(
        "Invalid file. Only real PDF files are accepted. "
        "Renamed or corrupted files are rejected."
    )
|
| 145 |
+
|
| 146 |
+
@staticmethod
|
| 147 |
+
def _sanitize_filename(filename: str) -> str:
|
| 148 |
+
"""
|
| 149 |
+
Strip path traversal characters and enforce .pdf extension.
|
| 150 |
+
Prevents directory traversal attacks like ../../etc/passwd.pdf
|
| 151 |
+
"""
|
| 152 |
+
name = Path(filename).name # strips any directory component
|
| 153 |
+
if not name.lower().endswith(".pdf"):
|
| 154 |
+
raise ValueError(f"Expected a PDF filename, got: {filename}")
|
| 155 |
+
return name
|
| 156 |
+
|
| 157 |
+
# ── Private: extraction logic ──────────────────────────────────
|
| 158 |
+
|
| 159 |
def _extract_text(self, doc: fitz.Document) -> str:
|
| 160 |
"""Extract all text from every page."""
|
| 161 |
pages = []
|
|
|
|
| 166 |
def _split_into_sections(self, text: str) -> dict:
|
| 167 |
"""
|
| 168 |
Split paper into named sections by common academic headers.
|
| 169 |
+
Not perfect — PDFs are messy — but good enough for downstream analysis.
|
| 170 |
"""
|
| 171 |
sections = {}
|
| 172 |
text_lower = text.lower()
|
|
|
|
| 196 |
|
| 197 |
def _extract_title(self, doc: fitz.Document, full_text: str) -> str:
|
| 198 |
"""
|
| 199 |
+
Try PDF metadata first, fall back to first meaningful line of text.
|
| 200 |
"""
|
| 201 |
meta = doc.metadata
|
| 202 |
if meta and meta.get("title"):
|
|
|
|
| 207 |
if len(line) > 10:
|
| 208 |
return line
|
| 209 |
|
| 210 |
+
return "Unknown Title"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|