Spaces:
Running
Running
| import re | |
| import fitz | |
| import pdfplumber | |
| from typing import List, Tuple | |
| from src.utils import PageData, PaperMetadata | |
# Regular expressions that identify a line as a section heading.
# detect_sections() tries them in order with re.match(..., re.IGNORECASE)
# against the stripped, lower-cased line and stops at the first hit.
SECTION_PATTERNS = [
    # Unnumbered front matter: the whole line must be the word itself.
    r"^abstract$",

    # Conventionally numbered sections; the dot after the number is optional.
    r"^1\.?\s*introduction",
    r"^2\.?\s*(?:related work|background)",
    r"^3\.?\s*(?:method|methodology|our approach)",
    r"^4\.?\s*experiment",
    r"^5\.?\s*(?:result|results|evaluation)",
    r"^6\.?\s*discussion",
    r"^7\.?\s*conclusion",

    # Unnumbered back matter.
    r"^references$",
    r"^appendix",

    # Catch-all: any numbered section.
    r"^\d+\.?\s+[A-Z][a-z]+",
]
def extract_text(filepath: str) -> List[PageData]:
    """
    Extract per-page text from a PDF, preferring PyMuPDF block output.

    The text blocks of each page are ordered by 30-point vertical band and
    then by left edge before being joined with newlines; this is the layout
    heuristic used to cope with multi-column pages. If the whole document
    yields fewer than 500 characters of text, the PyMuPDF result is discarded
    and the file is re-read with pdfplumber.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        One PageData per page with a 1-based ``page_num``. Pages produced by
        the PyMuPDF path also carry ``font_sizes``, a mapping from each text
        block's lower-cased concatenated text to the largest span font size
        found in that block.
    """
    pages = []
    doc = fitz.open(filepath)
    try:
        for page_num, page in enumerate(doc, start=1):
            # Largest font size per text block, keyed by the block's text.
            font_sizes = {}
            for block in page.get_text("dict").get("blocks", []):
                if block.get("type", -1) != 0:  # only type 0 blocks hold text
                    continue
                spans = [
                    span
                    for line in block.get("lines", [])
                    for span in line.get("spans", [])
                ]
                block_text = "".join(s.get("text", "") for s in spans).strip().lower()
                if block_text:
                    font_sizes[block_text] = max(
                        (s.get("size", 0) for s in spans), default=0
                    )

            # get_text("blocks") yields tuples; index 0 is x0, 1 is y0,
            # 4 is the text and 6 is the block type.
            blocks = page.get_text("blocks")
            # Sort top-to-bottom in 30-point bands, then left-to-right, so
            # blocks at roughly the same height are read across the columns.
            blocks.sort(key=lambda b: (round(b[1] / 30) * 30, b[0]))
            text = "\n".join(b[4].strip() for b in blocks if b[6] == 0)

            pages.append(PageData(
                page_num=page_num,
                text=text,
                width=page.rect.width,
                height=page.rect.height,
                font_sizes=font_sizes,
            ))
    finally:
        # Release the file handle even when a page fails to parse.
        doc.close()

    total_text = " ".join(p.text for p in pages)
    if len(total_text.strip()) < 500:
        # Too little text came back; retry with pdfplumber.
        pages = _extract_with_pdfplumber(filepath)
    return pages
def _extract_with_pdfplumber(filepath: str) -> List[PageData]:
    """
    Read every page of a PDF with pdfplumber.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        One PageData per page with a 1-based ``page_num``, the page text
        (empty string when pdfplumber returns nothing) and the page width and
        height as floats. No ``font_sizes`` argument is passed.
    """
    with pdfplumber.open(filepath) as pdf:
        return [
            PageData(
                page_num=number,
                text=pdf_page.extract_text() or "",
                width=float(pdf_page.width),
                height=float(pdf_page.height),
            )
            for number, pdf_page in enumerate(pdf.pages, start=1)
        ]
def extract_metadata(filepath: str, pages: List[PageData]) -> PaperMetadata:
    """
    Extract title, authors, year and DOI using layered heuristics.

    Values embedded in the PDF metadata are used first. Each field that is
    missing or unusable is then filled from the first page:

    * title   - the text of all spans set in the largest font size,
    * year    - the first standalone four-digit number starting with 19 or 20,
    * authors - lines above the "Abstract" heading that contain a comma,
                the word "University" or an "@".

    Args:
        filepath: Path of the PDF file; it is opened to read the embedded
            metadata and, if needed, the first page's font information.
        pages: Extracted pages of the same document; ``pages[0].text`` feeds
            the year and author heuristics and ``len(pages)`` is reported as
            the page count.

    Returns:
        A PaperMetadata in which any field that could not be determined holds
        an "Unknown ..." placeholder.
    """
    doc = fitz.open(filepath)
    try:
        # NOTE(review): PyMuPDF documents `metadata` as possibly None (e.g.
        # for encrypted files); treat that the same as "no metadata".
        meta = doc.metadata or {}
        title = (meta.get("title") or "").strip()
        authors = (meta.get("author") or "").strip()

        # creationDate looks like D:YYYYMMDDHHmmSSZ; only accept the year
        # when four ASCII digits really follow the "D:" prefix.
        year = ""
        date_match = re.match(r"D:([0-9]{4})", meta.get("creationDate") or "")
        if date_match:
            year = date_match.group(1)

        # Heuristic 1: title from the largest text on the first page. Also
        # used when the embedded title is a "Microsoft Word" export artefact.
        title_unusable = not title or len(title) < 5 or "Microsoft Word" in title
        if title_unusable and len(doc) > 0:
            spans = [
                (span["text"], span["size"])
                for block in doc[0].get_text("dict")["blocks"]
                if block["type"] == 0
                for line in block["lines"]
                for span in line["spans"]
            ]
            if spans:
                largest = max(size for _, size in spans)
                # Keep document order among the spans sharing the top size.
                title = " ".join(
                    text for text, size in spans if size == largest
                ).strip()

        # Heuristic 2: year from the first page. The lookarounds stop the
        # pattern from matching inside a longer run of digits.
        if not year and pages:
            year_match = re.search(
                r"(?<![0-9])(?:19|20)[0-9]{2}(?![0-9])", pages[0].text
            )
            if year_match:
                year = year_match.group(0)

        # Heuristic 3: authors from first-page lines above the abstract.
        if not authors and pages:
            author_lines = []
            for raw_line in pages[0].text.split("\n"):
                line = raw_line.strip()
                if re.match(r"^abstract$", line, re.IGNORECASE):
                    break
                if not line or line == title:
                    continue
                # Looks like an author line: commas, affiliation or e-mail.
                if "," in line or "University" in line or "@" in line:
                    author_lines.append(line)
            if author_lines:
                authors = "; ".join(author_lines)
    finally:
        # Release the file handle even when a heuristic raises.
        doc.close()

    return PaperMetadata(
        title=title if title else "Unknown Title",
        authors=authors if authors else "Unknown Authors",
        year=year if year else "Unknown Year",
        doi=meta.get("doi") or "Unknown DOI",
        n_pages=len(pages),
        filepath=filepath,
    )
def detect_sections(pages: List[PageData], filepath: str) -> List[PageData]:
    """
    Label each page with the most recent section heading seen in its text.

    A line counts as a heading when either

    * it matches one of SECTION_PATTERNS (case-insensitively), and, if the
      line starts with a digit, its first letter is upper-case in the
      original text - this stops body lines such as "3 of the samples ..."
      from being read as numbered headings; or
    * its lower-cased text is a key of the collected ``font_sizes``, that
      size exceeds the median block size by more than 1.5, and the line is
      shorter than 60 characters.

    The section starts as "Abstract" and carries over from page to page.

    Args:
        pages: Pages to label; each page's ``section`` attribute is set in
            place to the last heading found up to the end of that page.
        filepath: Unused; kept so existing callers keep working.

    Returns:
        The same ``pages`` list that was passed in.
    """
    # Compile once instead of on every line of every page.
    heading_patterns = [re.compile(p, re.IGNORECASE) for p in SECTION_PATTERNS]

    # Merge the per-page block sizes; later pages win on duplicate text.
    font_sizes = {}
    all_sizes = []
    for page in pages:
        # Pages built by the pdfplumber fallback are created without
        # font_sizes, so tolerate a missing or None value here.
        page_sizes = getattr(page, "font_sizes", None) or {}
        font_sizes.update(page_sizes)
        all_sizes.extend(page_sizes.values())
    median_size = sorted(all_sizes)[len(all_sizes) // 2] if all_sizes else 10

    current_section = "Abstract"
    for page in pages:
        for raw_line in page.text.split("\n"):
            heading = raw_line.strip()
            if not heading:
                continue
            key = heading.lower()

            # Regex match.
            is_heading = any(rx.match(key) for rx in heading_patterns)
            if is_heading and heading[0].isdigit():
                # Numbered headings must have a capitalised title word.
                first_letter = next((ch for ch in heading if ch.isalpha()), "")
                is_heading = first_letter.isupper()

            # Font size heuristic: short line in a clearly larger font.
            if not is_heading and key in font_sizes:
                is_heading = (
                    font_sizes[key] > median_size + 1.5 and len(key) < 60
                )

            if is_heading:
                current_section = heading
        page.section = current_section
    return pages