Spaces:

hellorahulk
/

docling_free

Running

hellorahulk committed on Jan 23, 2025

Commit

070e4b3

1 Parent(s): 1880d31

Fix document text extraction using proper Docling methods

Files changed (1) hide show

dockling_parser/parser.py CHANGED Viewed

@@ -63,8 +63,10 @@ class DocumentParser:
             result = self.converter.convert(str(file_path))
             doc = result.document
-            # Extract content and structure
-            content = doc.text
             structured_content = {
                 'sections': doc.sections if hasattr(doc, 'sections') else [],
                 'paragraphs': doc.paragraphs if hasattr(doc, 'paragraphs') else [],
@@ -72,6 +74,12 @@ class DocumentParser:
                 'metadata': doc.metadata if hasattr(doc, 'metadata') else {}
             }
             # Update metadata with document-specific information
             if hasattr(doc, 'metadata') and doc.metadata:
                 metadata.title = doc.metadata.get('title')
@@ -82,7 +90,7 @@ class DocumentParser:
             return ParsedDocument(
                 content=content,
                 metadata=metadata,
-                raw_text=doc.raw_text if hasattr(doc, 'raw_text') else None,
                 structured_content=structured_content,
                 confidence_score=getattr(doc, 'confidence', 1.0)
             )

             result = self.converter.convert(str(file_path))
             doc = result.document
+            # Extract content using proper methods
+            content = doc.export_to_text()
+            # Extract structured content
             structured_content = {
                 'sections': doc.sections if hasattr(doc, 'sections') else [],
                 'paragraphs': doc.paragraphs if hasattr(doc, 'paragraphs') else [],
                 'metadata': doc.metadata if hasattr(doc, 'metadata') else {}
             }
+            # Get raw text if available
+            try:
+                raw_text = doc.export_to_text(include_layout=True)
+            except:
+                raw_text = content
             # Update metadata with document-specific information
             if hasattr(doc, 'metadata') and doc.metadata:
                 metadata.title = doc.metadata.get('title')
             return ParsedDocument(
                 content=content,
                 metadata=metadata,
+                raw_text=raw_text,
                 structured_content=structured_content,
                 confidence_score=getattr(doc, 'confidence', 1.0)
             )