Spaces:

SCBconsulting
/

synclm-demo

Sleeping

synclm-demo / utils /parser.py

Create parser.py

021dc25 verified 5 months ago

624 Bytes

	import os
	import pdfplumber
	from docx import Document

	def parse_file(file_path) -> str:
	    """Extract plain text from a ``.txt``, ``.docx`` or ``.pdf`` file.

	    The format is chosen from the file extension, compared
	    case-insensitively.

	    Args:
	        file_path: Path of the file to read.

	    Returns:
	        The extracted text. For ``.docx`` the paragraph texts are joined
	        with newlines; for ``.pdf`` the per-page texts are joined with
	        newlines and surrounding whitespace is stripped. For any other
	        extension the literal string ``"Unsupported file format."`` is
	        returned (no exception is raised).
	    """
	    ext = os.path.splitext(file_path)[-1].lower()

	    if ext == ".txt":
	        with open(file_path, "r", encoding="utf-8") as f:
	            return f.read()

	    if ext == ".docx":
	        doc = Document(file_path)
	        return "\n".join(para.text for para in doc.paragraphs)

	    if ext == ".pdf":
	        with pdfplumber.open(file_path) as pdf:
	            # extract_text() may yield None for a page without extractable
	            # text (e.g. image-only pages); treat that as an empty page
	            # instead of failing on None + str.
	            page_texts = [page.extract_text() or "" for page in pdf.pages]
	        return "\n".join(page_texts).strip()

	    # Kept as a string sentinel (not an exception) so existing callers
	    # that display the result directly keep working.
	    return "Unsupported file format."