Spaces:

NEXAS
/

docling_rag

Sleeping

NEXAS committed on Mar 2, 2025

Commit

0a394f8

verified ·

1 Parent(s): 4a97e8c

Update utils/ingestion.py

Files changed (1) hide show

utils/ingestion.py CHANGED Viewed

@@ -29,7 +29,7 @@ class DocumentProcessor:
     def setup_document_converter(self):
         """Configure document converter to support multiple formats"""
         pipeline_options = PdfPipelineOptions()
-        pipeline_options.do_ocr = True
         pipeline_options.do_table_structure = True
         self.converter = DocumentConverter(
@@ -39,6 +39,8 @@ class DocumentProcessor:
                 InputFormat.DOCX,
                 InputFormat.HTML,
                 InputFormat.PPTX,
             ],
             format_options={
                 InputFormat.PDF: PdfFormatOption(
@@ -64,6 +66,13 @@ class DocumentProcessor:
             print(f"❌ Conversion failed: {e}")
             return None
         chunker = HierarchicalChunker()
         chunks = list(chunker.chunk(doc))

     def setup_document_converter(self):
         """Configure document converter to support multiple formats"""
         pipeline_options = PdfPipelineOptions()
+        pipeline_options.do_ocr = False
         pipeline_options.do_table_structure = True
         self.converter = DocumentConverter(
                 InputFormat.DOCX,
                 InputFormat.HTML,
                 InputFormat.PPTX,
+                InputFormat.TXT,  # Added text format
+                InputFormat.CSV,  # Added CSV format
             ],
             format_options={
                 InputFormat.PDF: PdfFormatOption(
             print(f"❌ Conversion failed: {e}")
             return None
+        # Save document as markdown
+        output_dir = Path("parsed-doc")
+        output_dir.mkdir(parents=True, exist_ok=True)
+        doc_filename = Path(file_path).stem
+        md_filename = output_dir / f"{doc_filename}.md"
+        doc.save_as_markdown(md_filename)
         chunker = HierarchicalChunker()
         chunks = list(chunker.chunk(doc))