Spaces:
Running
Running
File size: 1,181 Bytes
c1d887c | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 | from docx import Document
from io import BytesIO
import logging
# Module-level logger, named after this module so records can be filtered by origin.
logger = logging.getLogger(__name__)
def parse_docx_structure(content: bytes) -> list:
    """
    Extract the non-empty paragraphs of a DOCX file as a flat, ordered list.

    Paragraph text is only stripped of surrounding whitespace; it is never
    translated or otherwise normalised, so Filipino/English (Taglish)
    content is returned as-is. Document hierarchy is not rebuilt as a tree:
    it is conveyed by the order of the entries together with each entry's
    ``is_heading`` flag and ``style`` name.

    Only the paragraphs exposed by ``doc.paragraphs`` are visited.

    Args:
        content: Raw bytes of a .docx file.

    Returns:
        A list of dicts in document order, one per non-empty paragraph:

        - ``text``: the paragraph text with surrounding whitespace removed.
        - ``style``: the paragraph's style name, or ``""`` when the
          paragraph has no style or the style has no name.
        - ``is_heading``: True when the style name contains ``"Heading"``
          or ``"Title"`` (case-sensitive substring match).
        - ``metadata``: ``{"bold": bool, "italic": bool}``, each True when
          at least one run in the paragraph has that property explicitly
          switched on.

    Raises:
        Exception: any error raised while opening or walking the document
            is logged (with traceback) and re-raised unchanged.
    """
    # "Heading" already matches "Heading 1", "Heading 2", ... as a substring,
    # so the numbered variants do not need to be listed separately.
    heading_markers = ("Heading", "Title")

    try:
        doc = Document(BytesIO(content))
        elements = []
        for para in doc.paragraphs:
            text = para.text.strip()
            if not text:
                continue

            # Defensive: a paragraph may resolve to no style at all, or to a
            # style whose name is None. Fall back to "" so one unstyled
            # paragraph does not abort parsing of the whole document.
            style = para.style
            style_name = (style.name if style is not None else None) or ""

            # Basic style detection for headings.
            is_heading = any(marker in style_name for marker in heading_markers)

            # Read the runs once; both formatting checks use the same list.
            runs = para.runs
            elements.append({
                "text": text,
                "style": style_name,
                "is_heading": is_heading,
                "metadata": {
                    # run.bold / run.italic are tri-state (True/False/None);
                    # any() collapses them to a plain bool.
                    "bold": any(run.bold for run in runs),
                    "italic": any(run.italic for run in runs),
                },
            })
        return elements
    except Exception as e:
        # logger.exception records the traceback alongside the message;
        # the error is still propagated to the caller.
        logger.exception("Error parsing DOCX: %s", e)
        raise
|