Spaces:
Running on Zero
Running on Zero
File size: 1,185 Bytes
b5e0c74 9707a84 b5e0c74 9707a84 b5e0c74 9707a84 b5e0c74 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 | import re
from pathlib import Path
from urllib.parse import urlparse
from app.core.models import SourceType
# Matches an arXiv identifier of the form "four digits, dot, four or five
# digits", optionally preceded by an "arxiv.org/abs/" or "arxiv.org/pdf/" path
# and optionally followed by a version suffix such as "v2". The named group
# "id" captures the identifier without the version suffix.
# The digit look-arounds stop the pattern from matching a slice of a longer
# number (e.g. "12345.67890" or "2301.123456"); only digits are excluded so
# that DOI-style text like "arXiv.2301.12345" still matches.
ARXIV_RE = re.compile(
    r"(?:arxiv\.org/(?:abs|pdf)/)?(?<!\d)(?P<id>\d{4}\.\d{4,5})(?!\d)(?:v\d+)?",
    re.I,
)

# Host-name fragments for Medium-style article sources.
# NOTE(review): not referenced by the code visible in this part of the module.
MEDIUM_HOST_PARTS = ("medium.com", "freedium")
def detect_source(url: str | None, pdf_path: str | None) -> SourceType:
    """Classify the supplied input as a PDF upload, an arXiv reference, or a web article.

    Args:
        url: A link or bare arXiv ID typed by the user; may be ``None`` or blank.
        pdf_path: Path of an uploaded file; when truthy it takes precedence
            over ``url``.

    Returns:
        ``SourceType.PDF`` for an upload whose suffix is ``.pdf``
        (case-insensitive), ``SourceType.ARXIV`` when the URL host contains
        ``arxiv.org`` or the text matches ``ARXIV_RE``, and
        ``SourceType.MEDIUM`` for any other ``http``/``https`` URL.

    Raises:
        ValueError: If the upload is not a PDF, if neither input is provided,
            or if the text is neither an arXiv reference nor an http(s) URL.
    """
    # An uploaded file wins over whatever was typed into the URL field.
    if pdf_path:
        if Path(pdf_path).suffix.lower() != ".pdf":
            raise ValueError("Uploaded file must be a PDF.")
        return SourceType.PDF

    candidate = (url or "").strip()
    if not candidate:
        raise ValueError("Provide a Medium article link, arXiv link/ID, or upload a PDF.")

    parts = urlparse(candidate)
    netloc = parts.netloc.lower()

    # arXiv is checked first so that arxiv.org links are not treated as
    # generic web articles; the regex also accepts bare IDs with no scheme.
    is_arxiv = "arxiv.org" in netloc or ARXIV_RE.search(candidate) is not None
    if is_arxiv:
        return SourceType.ARXIV

    # Every remaining http(s) URL is handled as a Medium-style article.
    if parts.scheme in ("http", "https"):
        return SourceType.MEDIUM

    raise ValueError("Could not detect source type. Use a Medium URL, arXiv URL/ID, or PDF.")
def extract_arxiv_id(value: str) -> str:
    """Return the arXiv identifier contained in *value*.

    Surrounding whitespace is ignored. The result is the text captured by the
    ``id`` group of ``ARXIV_RE`` for the first match in the string.

    Raises:
        ValueError: If ``ARXIV_RE`` finds no match in *value*.
    """
    found = ARXIV_RE.search(value.strip())
    if found is None:
        raise ValueError("Could not find a valid arXiv ID.")
    return found["id"]
|