Spaces:
Sleeping
Sleeping
| """ | |
| DOCX text extraction using python-docx. | |
| Extracts text preserving paragraph structure, tables, and document properties. | |
| """ | |
| import time | |
| import os | |
| from docx import Document | |
| from models.schemas import ExtractionResult, DocumentMetadata | |
| def extract_docx(file_path: str) -> ExtractionResult: | |
| """Extract text and metadata from a DOCX file.""" | |
| start_time = time.time() | |
| try: | |
| doc = Document(file_path) | |
| # Extract paragraphs | |
| paragraphs = [] | |
| for para in doc.paragraphs: | |
| text = para.text.strip() | |
| if text: | |
| # Preserve heading structure | |
| if para.style and para.style.name.startswith("Heading"): | |
| level = para.style.name.replace("Heading ", "").strip() | |
| prefix = "#" * int(level) if level.isdigit() else "##" | |
| paragraphs.append(f"{prefix} {text}") | |
| else: | |
| paragraphs.append(text) | |
| # Extract tables | |
| tables_text = [] | |
| for table_idx, table in enumerate(doc.tables): | |
| table_data = [] | |
| for row in table.rows: | |
| row_data = [cell.text.strip() for cell in row.cells] | |
| table_data.append(" | ".join(row_data)) | |
| if table_data: | |
| tables_text.append(f"\n[Table {table_idx + 1}]\n" + "\n".join(table_data)) | |
| # Combine all text | |
| full_text = "\n\n".join(paragraphs) | |
| if tables_text: | |
| full_text += "\n\n" + "\n".join(tables_text) | |
| # Extract metadata from core properties | |
| props = doc.core_properties | |
| metadata = DocumentMetadata( | |
| title=props.title or os.path.basename(file_path), | |
| author=props.author or "Unknown", | |
| creation_date=str(props.created) if props.created else "", | |
| modification_date=str(props.modified) if props.modified else "", | |
| page_count=None, # DOCX doesn't expose page count easily | |
| word_count=len(full_text.split()) if full_text else 0, | |
| character_count=len(full_text), | |
| file_type="DOCX", | |
| extra={ | |
| "category": props.category or "", | |
| "comments": props.comments or "", | |
| "last_modified_by": props.last_modified_by or "", | |
| "revision": props.revision, | |
| "subject": props.subject or "", | |
| "keywords": props.keywords or "", | |
| "paragraph_count": len(doc.paragraphs), | |
| "table_count": len(doc.tables), | |
| } | |
| ) | |
| elapsed = (time.time() - start_time) * 1000 | |
| if not full_text.strip(): | |
| return ExtractionResult( | |
| raw_text="", | |
| metadata=metadata, | |
| success=False, | |
| error_message="No text content found in the DOCX file.", | |
| extraction_time_ms=elapsed, | |
| ) | |
| return ExtractionResult( | |
| raw_text=full_text, | |
| metadata=metadata, | |
| success=True, | |
| extraction_time_ms=elapsed, | |
| ) | |
| except Exception as e: | |
| elapsed = (time.time() - start_time) * 1000 | |
| return ExtractionResult( | |
| raw_text="", | |
| metadata=DocumentMetadata(file_type="DOCX"), | |
| success=False, | |
| error_message=f"DOCX extraction failed: {str(e)}", | |
| extraction_time_ms=elapsed, | |
| ) | |