Spaces:
Sleeping
Sleeping
Update utils/ingestion.py
Browse files
- utils/ingestion.py +10 -1
utils/ingestion.py
CHANGED
|
@@ -29,7 +29,7 @@ class DocumentProcessor:
|
|
| 29 |
def setup_document_converter(self):
|
| 30 |
"""Configure document converter to support multiple formats"""
|
| 31 |
pipeline_options = PdfPipelineOptions()
|
| 32 |
-
pipeline_options.do_ocr =
|
| 33 |
pipeline_options.do_table_structure = True
|
| 34 |
|
| 35 |
self.converter = DocumentConverter(
|
|
@@ -39,6 +39,8 @@ class DocumentProcessor:
|
|
| 39 |
InputFormat.DOCX,
|
| 40 |
InputFormat.HTML,
|
| 41 |
InputFormat.PPTX,
|
|
|
|
|
|
|
| 42 |
],
|
| 43 |
format_options={
|
| 44 |
InputFormat.PDF: PdfFormatOption(
|
|
@@ -64,6 +66,13 @@ class DocumentProcessor:
|
|
| 64 |
print(f"❌ Conversion failed: {e}")
|
| 65 |
return None
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
chunker = HierarchicalChunker()
|
| 68 |
chunks = list(chunker.chunk(doc))
|
| 69 |
|
|
|
|
| 29 |
def setup_document_converter(self):
|
| 30 |
"""Configure document converter to support multiple formats"""
|
| 31 |
pipeline_options = PdfPipelineOptions()
|
| 32 |
+
pipeline_options.do_ocr = False
|
| 33 |
pipeline_options.do_table_structure = True
|
| 34 |
|
| 35 |
self.converter = DocumentConverter(
|
|
|
|
| 39 |
InputFormat.DOCX,
|
| 40 |
InputFormat.HTML,
|
| 41 |
InputFormat.PPTX,
|
| 42 |
+
InputFormat.TXT, # Added text format
|
| 43 |
+
InputFormat.CSV, # Added CSV format
|
| 44 |
],
|
| 45 |
format_options={
|
| 46 |
InputFormat.PDF: PdfFormatOption(
|
|
|
|
| 66 |
print(f"❌ Conversion failed: {e}")
|
| 67 |
return None
|
| 68 |
|
| 69 |
+
# Save document as markdown
|
| 70 |
+
output_dir = Path("parsed-doc")
|
| 71 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 72 |
+
doc_filename = Path(file_path).stem
|
| 73 |
+
md_filename = output_dir / f"{doc_filename}.md"
|
| 74 |
+
doc.save_as_markdown(md_filename)
|
| 75 |
+
|
| 76 |
chunker = HierarchicalChunker()
|
| 77 |
chunks = list(chunker.chunk(doc))
|
| 78 |
|