examinal-api / app /utils /file_parser.py
aryanmirza01's picture
FYP Clean Final Push
419bd6e
"""
Parse PDF, DOCX, and PPTX files into page‑level text dicts.
Returns: List[{"text": str, "page": int | None}]
"""
import logging
from typing import List, Dict, Any
logger = logging.getLogger(__name__)
def parse_pdf(path: str) -> List[Dict[str, Any]]:
import pdfplumber
pages: List[Dict[str, Any]] = []
with pdfplumber.open(path) as pdf:
for i, page in enumerate(pdf.pages):
text = page.extract_text() or ""
# Also try extracting tables
tables = page.extract_tables() or []
for table in tables:
for row in table:
if row:
text += "\n" + " | ".join(str(cell or "") for cell in row)
pages.append({"text": text, "page": i + 1})
logger.info("Parsed PDF %s: %d pages", path, len(pages))
return pages
def parse_docx(path: str) -> List[Dict[str, Any]]:
from docx import Document
doc = Document(path)
full_text = []
for para in doc.paragraphs:
if para.text.strip():
full_text.append(para.text)
# Also extract text from tables
for table in doc.tables:
for row in table.rows:
row_text = " | ".join(cell.text.strip() for cell in row.cells if cell.text.strip())
if row_text:
full_text.append(row_text)
combined = "\n".join(full_text)
logger.info("Parsed DOCX %s: %d chars", path, len(combined))
return [{"text": combined, "page": None}]
def parse_pptx(path: str) -> List[Dict[str, Any]]:
from pptx import Presentation
prs = Presentation(path)
pages: List[Dict[str, Any]] = []
for i, slide in enumerate(prs.slides):
texts = []
for shape in slide.shapes:
if shape.has_text_frame:
for paragraph in shape.text_frame.paragraphs:
text = paragraph.text.strip()
if text:
texts.append(text)
if shape.has_table:
for row in shape.table.rows:
row_text = " | ".join(cell.text.strip() for cell in row.cells if cell.text.strip())
if row_text:
texts.append(row_text)
page_text = "\n".join(texts)
if page_text:
pages.append({"text": page_text, "page": i + 1})
logger.info("Parsed PPTX %s: %d slides", path, len(pages))
return pages