synclm-demo / utils /parser.py
SCBconsulting's picture
Create parser.py
021dc25 verified
raw
history blame contribute delete
624 Bytes
import os
import pdfplumber
from docx import Document
def parse_file(file_path):
    """Extract the plain text from a .txt, .docx, or .pdf file.

    The format is chosen from the file extension, compared
    case-insensitively.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text as a string. For any other extension the
        literal string "Unsupported file format." is returned rather
        than an exception being raised.
    """
    ext = os.path.splitext(file_path)[-1].lower()

    if ext == ".txt":
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()

    if ext == ".docx":
        doc = Document(file_path)
        # One line of output per paragraph of the document.
        return "\n".join(para.text for para in doc.paragraphs)

    if ext == ".pdf":
        with pdfplumber.open(file_path) as pdf:
            # extract_text() may give None for a page without extractable
            # text (e.g. a scanned image) on some pdfplumber versions;
            # treat that as an empty page instead of failing on None + str.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
        # Joining with "\n" and stripping matches appending "\n" after
        # every page and then stripping the result.
        return "\n".join(page_texts).strip()

    return "Unsupported file format."