import re
from typing import Dict, List, Tuple

import fitz
import pdfplumber

from src.utils import PageData, PaperMetadata

# Catch-all for numbered headings. It depends on letter case to separate a
# heading ("3 Proposed Framework") from body text ("3 of the samples"), so it
# must never be applied case-insensitively or to lowercased text.
_NUMBERED_SECTION_PATTERN = r"^\d+\.?\s+[A-Z][a-z]+"

SECTION_PATTERNS = [
    r"^abstract$",
    r"^1\.?\s*introduction",
    r"^2\.?\s*(?:related work|background)",
    r"^3\.?\s*(?:method|methodology|our approach)",
    r"^4\.?\s*experiment",
    r"^5\.?\s*(?:result|results|evaluation)",
    r"^6\.?\s*discussion",
    r"^7\.?\s*conclusion",
    r"^references$",
    r"^appendix",
    _NUMBERED_SECTION_PATTERN,  # any numbered section
]

# Named section patterns, compiled once; these are safe to match ignoring case.
_NAMED_SECTION_RES = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in SECTION_PATTERNS
    if pattern != _NUMBERED_SECTION_PATTERN
)
# Case-sensitive numbered heading: the first letter must be uppercase, the rest
# may be Title Case or ALL CAPS ("3 Proposed Framework", "3 PROPOSED FRAMEWORK").
_NUMBERED_HEADING_RE = re.compile(r"^\d+\.?\s+[A-Z][A-Za-z]+")

# PDF date strings look like "D:YYYYMMDDHHmmSS..."; capture the 4-digit year.
_CREATION_YEAR_RE = re.compile(r"D:(\d{4})")
# A standalone 4-digit year (19xx/20xx) that is not part of a longer number.
_YEAR_RE = re.compile(r"(?<!\d)(?:19|20)\d{2}(?!\d)")

# Blocks whose top edge falls in the same vertical band are ordered left to
# right; bands are ordered top to bottom.
_ROW_BAND_HEIGHT = 30
# Below this many characters the PyMuPDF result is considered unusable.
_MIN_TEXT_CHARS = 500


def _block_font_sizes(page: "fitz.Page") -> Dict[str, float]:
    """Map each text block's lowercased text to the largest span size in it."""
    font_sizes = {}
    for block in page.get_text("dict").get("blocks", []):
        if block.get("type", -1) != 0:  # type 0 is text
            continue
        spans = [
            span
            for line in block.get("lines", [])
            for span in line.get("spans", [])
        ]
        block_text = "".join(span.get("text", "") for span in spans).strip().lower()
        if block_text:
            font_sizes[block_text] = max(
                (span.get("size", 0) for span in spans), default=0
            )
    return font_sizes


def _page_text(page: "fitz.Page") -> str:
    """Join the page's text blocks in banded top-to-bottom, left-to-right order."""
    # Each block tuple is (x0, y0, x1, y1, text, block_no, block_type).
    blocks = sorted(
        page.get_text("blocks"),
        key=lambda b: (round(b[1] / _ROW_BAND_HEIGHT) * _ROW_BAND_HEIGHT, b[0]),
    )
    return "\n".join(b[4].strip() for b in blocks if b[6] == 0)  # type 0 is text


def extract_text(filepath: str) -> List[PageData]:
    """
    Extract per-page text from a PDF.

    PyMuPDF blocks are used first. If the whole document yields fewer than
    500 characters of text, the result is discarded and the file is re-read
    with pdfplumber instead.

    Args:
        filepath: Path of the PDF to read.

    Returns:
        One PageData per page, with 1-based ``page_num``. Pages produced by
        PyMuPDF also carry ``font_sizes`` (lowercased block text -> largest
        font size in that block); pages from the pdfplumber fallback do not
        set it.
    """
    pages = []
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(filepath) as doc:
        for page_num, page in enumerate(doc, start=1):
            pages.append(
                PageData(
                    page_num=page_num,
                    text=_page_text(page),
                    width=page.rect.width,
                    height=page.rect.height,
                    font_sizes=_block_font_sizes(page),
                )
            )

    total_text = " ".join(p.text for p in pages)
    if len(total_text.strip()) < _MIN_TEXT_CHARS:
        pages = _extract_with_pdfplumber(filepath)
    return pages


def _extract_with_pdfplumber(filepath: str) -> List[PageData]:
    """Extract per-page text with pdfplumber (no font size information)."""
    pages = []
    with pdfplumber.open(filepath) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            pages.append(
                PageData(
                    page_num=page_num,
                    # extract_text() may return None; normalise to "".
                    text=page.extract_text() or "",
                    width=float(page.width),
                    height=float(page.height),
                )
            )
    return pages


def _largest_text_on_page(page: "fitz.Page") -> str:
    """Return the text of all spans set in the page's largest font size."""
    candidates = []
    for block in page.get_text("dict").get("blocks", []):
        if block.get("type", -1) != 0:
            continue
        for line in block.get("lines", []):
            for span in line.get("spans", []):
                text = span.get("text", "")
                # A blank span in a huge font must not win the title contest.
                if text.strip():
                    candidates.append((text, span.get("size", 0)))
    if not candidates:
        return ""
    largest = max(size for _, size in candidates)
    return " ".join(text for text, size in candidates if size == largest).strip()


def _guess_authors(first_page_text: str, title: str) -> str:
    """Collect author-looking lines that appear before the abstract heading."""
    author_lines = []
    for line in first_page_text.split("\n"):
        stripped = line.strip()
        if re.match(r"^abstract$", stripped, re.IGNORECASE):
            break
        if not stripped or stripped == title:
            continue
        # Looks like an author line: commas, an affiliation, or an e-mail.
        if "," in line or "University" in line or "@" in line:
            author_lines.append(stripped)
    return "; ".join(author_lines)


def extract_metadata(filepath: str, pages: List[PageData]) -> PaperMetadata:
    """
    Extract metadata using a layered heuristic approach.

    Embedded PDF metadata is tried first. Missing fields are then guessed:
    the title from the largest text on the first page, the year from the
    first 4-digit 19xx/20xx number in the first page's text, and the authors
    from author-looking lines that precede the abstract.

    Args:
        filepath: Path of the PDF to read.
        pages: Pages previously extracted from the same file; only the first
            page's text is consulted, and the count is reported as n_pages.

    Returns:
        A PaperMetadata in which every field that could not be determined is
        replaced by an "Unknown ..." placeholder string.
    """
    with fitz.open(filepath) as doc:
        meta = doc.metadata or {}
        title = (meta.get("title") or "").strip()
        authors = (meta.get("author") or "").strip()

        # Year from creationDate (format: D:YYYYMMDDHHmmSSZ). Anything that
        # does not carry four digits is ignored so the fallback can run.
        year = ""
        year_match = _CREATION_YEAR_RE.match(meta.get("creationDate") or "")
        if year_match:
            year = year_match.group(1)

        # Heuristic 1: title from the largest text on the first page. Word
        # processors often store "Microsoft Word - file.docx" as the title.
        title_is_unusable = (
            not title or len(title) < 5 or "Microsoft Word" in title
        )
        if title_is_unusable and len(doc) > 0:
            title = _largest_text_on_page(doc[0]) or title

    # Heuristic 2: year from a regex on the first page.
    if not year and pages:
        first_year = _YEAR_RE.search(pages[0].text)
        if first_year:
            year = first_year.group(0)

    # Heuristic 3: authors from first page text before the abstract.
    if not authors and pages:
        authors = _guess_authors(pages[0].text, title)

    return PaperMetadata(
        title=title if title else "Unknown Title",
        authors=authors if authors else "Unknown Authors",
        year=year if year else "Unknown Year",
        doi=meta.get("doi") or "Unknown DOI",
        n_pages=len(pages),
        filepath=filepath,
    )


def _is_section_heading(line: str) -> bool:
    """Return True if the stripped line matches a known section pattern."""
    if any(regex.match(line) for regex in _NAMED_SECTION_RES):
        return True
    return _NUMBERED_HEADING_RE.match(line) is not None


def detect_sections(pages: List[PageData], filepath: str) -> List[PageData]:
    """
    Detect sections using regex and font size heuristics.

    Lines are scanned in page order. A line becomes the current section when
    it matches a section pattern, or when it is shorter than 60 characters
    and its block's font size exceeds the document median by more than 1.5.
    Each page's ``section`` attribute is set to the section in effect at the
    end of that page.

    Args:
        pages: Pages to annotate; they are modified in place.
        filepath: Unused; kept so existing callers keep working.

    Returns:
        The same list that was passed in.
    """
    current_section = "Abstract"
    font_sizes = {}
    all_sizes = []
    for page in pages:
        # Pages from the pdfplumber fallback may have no font data at all.
        page_sizes = getattr(page, "font_sizes", None) or {}
        font_sizes.update(page_sizes)
        all_sizes.extend(page_sizes.values())

    median_size = sorted(all_sizes)[len(all_sizes) // 2] if all_sizes else 10

    for page in pages:
        for raw_line in page.text.split("\n"):
            heading = raw_line.strip()
            if not heading:
                continue

            # Regex match
            if _is_section_heading(heading):
                current_section = heading
                continue

            # Font size heuristic: short and noticeably larger -> likely header.
            key = heading.lower()
            size = font_sizes.get(key)
            if size is not None and size > median_size + 1.5 and len(key) < 60:
                current_section = heading

        page.section = current_section

    return pages