Spaces:

aryanmirza01
/

examinal-api

Configuration error

App Files Files Community

examinal-api / app /utils /file_parser.py

aryanmirza01

FYP Clean Final Push

419bd6e about 2 months ago

raw

history blame contribute delete

2.42 kB

	"""
	Parse PDF, DOCX, and PPTX files into page-level text dicts.
	Returns: List[{"text": str, "page": int | None}]
	"""

	import logging
	from typing import List, Dict, Any

	logger = logging.getLogger(__name__)


	def parse_pdf(path: str) -> List[Dict[str, Any]]:
	    """Extract the text of every page of a PDF, one dict per page.

	    A page's text is the result of ``page.extract_text()`` (or an empty
	    string when that is falsy), followed by one extra line per non-empty
	    row of every table returned by ``page.extract_tables()``. Row cells are
	    joined with ``" | "``; falsy cells (e.g. ``None``) become empty strings.

	    Args:
	        path: Location of the PDF, handed straight to ``pdfplumber.open``.

	    Returns:
	        A list of ``{"text": str, "page": int}`` dicts in document order,
	        with 1-based page numbers. Pages without any text are still
	        included (with an empty ``"text"``).
	    """
	    # Imported inside the function so the module loads without pdfplumber.
	    import pdfplumber

	    pages: List[Dict[str, Any]] = []
	    with pdfplumber.open(path) as pdf:
	        for page_number, page in enumerate(pdf.pages, start=1):
	            # Collect the pieces and join once instead of repeated `+=`.
	            parts = [page.extract_text() or ""]
	            # Also append table contents, one line per non-empty row.
	            for table in page.extract_tables() or []:
	                for row in table:
	                    if row:
	                        parts.append(
	                            " | ".join(str(cell or "") for cell in row)
	                        )
	            pages.append({"text": "\n".join(parts), "page": page_number})
	    logger.info("Parsed PDF %s: %d pages", path, len(pages))
	    return pages


	def parse_docx(path: str) -> List[Dict[str, Any]]:
	    """Extract the text of a DOCX file as a single page-less entry.

	    The text consists of every paragraph whose text is not blank (kept
	    unstripped), followed by one line per table row. A row line joins the
	    stripped, non-empty cell texts with ``" | "``; rows with no non-empty
	    cells are skipped. All paragraphs come before all table rows, so the
	    original interleaving of paragraphs and tables is not preserved.

	    Args:
	        path: Location of the DOCX, handed straight to ``docx.Document``.

	    Returns:
	        A one-element list ``[{"text": str, "page": None}]``; ``"page"`` is
	        always ``None`` because no page information is extracted here.
	    """
	    # Imported inside the function so the module loads without python-docx.
	    from docx import Document

	    doc = Document(path)
	    full_text = [para.text for para in doc.paragraphs if para.text.strip()]

	    # Also extract text from tables, one line per row.
	    for table in doc.tables:
	        for row in table.rows:
	            # Strip each cell once, then drop the empty ones.
	            cell_texts = [cell.text.strip() for cell in row.cells]
	            row_text = " | ".join(text for text in cell_texts if text)
	            if row_text:
	                full_text.append(row_text)

	    combined = "\n".join(full_text)
	    logger.info("Parsed DOCX %s: %d chars", path, len(combined))
	    return [{"text": combined, "page": None}]


	def parse_pptx(path: str) -> List[Dict[str, Any]]:
	    """Extract the text of a PPTX file, one dict per slide that has text.

	    For every shape on a slide, the stripped, non-empty paragraphs of its
	    text frame are collected (when ``shape.has_text_frame``), and each table
	    row (when ``shape.has_table``) contributes one line that joins the
	    stripped, non-empty cell texts with ``" | "``. The collected lines are
	    joined with newlines.

	    Args:
	        path: Location of the PPTX, handed straight to ``pptx.Presentation``.

	    Returns:
	        A list of ``{"text": str, "page": int}`` dicts in slide order.
	        ``"page"`` is the 1-based slide position; slides that yield no text
	        are omitted, so the numbers may have gaps.
	    """
	    # Imported inside the function so the module loads without python-pptx.
	    from pptx import Presentation

	    prs = Presentation(path)
	    pages: List[Dict[str, Any]] = []
	    for slide_number, slide in enumerate(prs.slides, start=1):
	        texts = []
	        for shape in slide.shapes:
	            if shape.has_text_frame:
	                for paragraph in shape.text_frame.paragraphs:
	                    text = paragraph.text.strip()
	                    if text:
	                        texts.append(text)
	            if shape.has_table:
	                for row in shape.table.rows:
	                    # Strip each cell once, then drop the empty ones.
	                    cell_texts = [cell.text.strip() for cell in row.cells]
	                    row_text = " | ".join(t for t in cell_texts if t)
	                    if row_text:
	                        texts.append(row_text)
	        page_text = "\n".join(texts)
	        # Slides without any text are skipped; numbering keeps the slide index.
	        if page_text:
	            pages.append({"text": page_text, "page": slide_number})
	    # The logged count is the number of slides that produced text.
	    logger.info("Parsed PPTX %s: %d slides", path, len(pages))
	    return pages