Spaces:

Ronochieng
/

DocMindAI

Build error

App Files Community

Ronochieng committed on Apr 27, 2025

Commit

2e83ef9

verified ·

1 Parent(s): f0d60d1

Update Ingestion/ingest.py

Browse files

Files changed (1) hide show

Ingestion/ingest.py +110 -107

Ingestion/ingest.py CHANGED Viewed

@@ -4,18 +4,21 @@ import pandas as pd
 import tempfile
 from typing import Dict, Any, Optional, List
-# Import unstructured components for different file types
-from unstructured.partition.auto import partition
-from unstructured.partition.pdf import partition_pdf
-from unstructured.partition.docx import partition_docx
-from unstructured.partition.pptx import partition_pptx
-from unstructured.partition.xlsx import partition_xlsx
-from unstructured.partition.md import partition_md
-from unstructured.partition.html import partition_html
-from unstructured.partition.xml import partition_xml
-from unstructured.partition.email import partition_email
-from unstructured.partition.text import partition_text
-from unstructured.partition.epub import partition_epub
 def get_processor_for_file(file_path: str) -> Optional[callable]:
     """
@@ -23,7 +26,7 @@ def get_processor_for_file(file_path: str) -> Optional[callable]:
     """
     file_extension = os.path.splitext(file_path)[1].lower()
-    # Map file extensions to specific partition functions
     processors = {
         ".pdf": process_pdf,
         ".docx": process_docx,
@@ -40,7 +43,7 @@ def get_processor_for_file(file_path: str) -> Optional[callable]:
         ".eml": process_email,
         ".epub": process_epub,
         ".txt": process_text,
-        ".csv": process_text,
         ".rtf": process_text,
         # Code files
@@ -75,183 +78,183 @@ def process_document(file_path: str) -> Optional[str]:
 def process_pdf(file_path: str) -> str:
     """
-    Process PDF documents using unstructured
     """
-    temp_dir = tempfile.mkdtemp()
-    try:
-        # Try hi_res mode first with OCR capabilities
-        elements = partition_pdf(
-            filename=file_path,
-            strategy="hi_res",
-            extract_images_in_pdf=True,
-            extract_image_block_types=["Image", "Table"],
-            extract_image_block_to_payload=False,
-            extract_image_block_output_dir=temp_dir,
-            hi_res_model_name="yolox",
-            infer_table_structure=True,
-            chunking_strategy="by_title",
-            max_characters=4000,
-            new_after_n_chars=3800,
-            combine_text_under_n_chars=2000,
-        )
-    except Exception as e:
-        # Fall back to fast mode if hi_res fails
-        elements = partition_pdf(
-            filename=file_path,
-            strategy="fast",
-            chunking_strategy="by_title",
-            max_characters=4000,
-            new_after_n_chars=3800,
-            combine_text_under_n_chars=2000,
-        )
-    # Extract text from elements
-    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
-    combined_text = "\n\n".join(texts)
-    return combined_text
 def process_docx(file_path: str) -> str:
     """
-    Process DOCX documents using unstructured
     """
-    elements = partition_docx(
-        filename=file_path,
-        chunking_strategy="by_title",
-        max_characters=4000,
-        new_after_n_chars=3800,
-        combine_text_under_n_chars=2000,
-    )
-    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
     combined_text = "\n\n".join(texts)
     return combined_text
 def process_pptx(file_path: str) -> str:
     """
-    Process PPTX documents using unstructured
     """
-    elements = partition_pptx(
-        filename=file_path,
-    )
-    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
     combined_text = "\n\n".join(texts)
     return combined_text
 def process_xlsx(file_path: str) -> str:
     """
-    Process XLSX documents using unstructured
     """
-    elements = partition_xlsx(
-        filename=file_path,
-    )
-    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
     combined_text = "\n\n".join(texts)
     return combined_text
 def process_markdown(file_path: str) -> str:
     """
-    Process Markdown documents using unstructured
     """
-    elements = partition_md(
-        filename=file_path,
-    )
-    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
     combined_text = "\n\n".join(texts)
     return combined_text
 def process_html(file_path: str) -> str:
     """
-    Process HTML documents using unstructured
     """
-    elements = partition_html(
-        filename=file_path,
-    )
-    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
     combined_text = "\n\n".join(texts)
     return combined_text
 def process_xml(file_path: str) -> str:
     """
-    Process XML documents using unstructured
     """
-    elements = partition_xml(
-        filename=file_path,
-    )
-    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
     combined_text = "\n\n".join(texts)
     return combined_text
 def process_email(file_path: str) -> str:
     """
-    Process email documents using unstructured
     """
-    elements = partition_email(
-        filename=file_path,
-    )
-    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
     combined_text = "\n\n".join(texts)
     return combined_text
 def process_text(file_path: str) -> str:
     """
-    Process text documents using unstructured
     """
-    elements = partition_text(
-        filename=file_path,
-        chunking_strategy="by_title",
-        max_characters=4000,
-        new_after_n_chars=3800,
-        combine_text_under_n_chars=2000,
-    )
-    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
-    combined_text = "\n\n".join(texts)
-    return combined_text
 def process_epub(file_path: str) -> str:
     """
-    Process EPUB documents using unstructured
     """
-    elements = partition_epub(
-        filename=file_path,
-    )
-    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
     combined_text = "\n\n".join(texts)
     return combined_text
 def process_generic(file_path: str) -> str:
     """
-    Generic document processor using unstructured's auto partitioning
     """
     try:
-        elements = partition(
-            filename=file_path,
-        )
-        texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
         combined_text = "\n\n".join(texts)
         return combined_text
     except Exception as e:
-        # Fall back to basic text processing if auto-partition fails
         try:
             with open(file_path, 'r', encoding='utf-8') as f:
                 return f.read()

 import tempfile
 from typing import Dict, Any, Optional, List
+# Import Langchain document loaders
+from langchain_community.document_loaders import (
+    PyMuPDFLoader,
+    UnstructuredWordDocumentLoader,
+    UnstructuredPowerPointLoader,
+    UnstructuredExcelLoader,
+    UnstructuredMarkdownLoader,
+    UnstructuredHTMLLoader,
+    UnstructuredXMLLoader,
+    UnstructuredEmailLoader,
+    UnstructuredFileLoader,
+    UnstructuredEPubLoader,
+    CSVLoader,
+    TextLoader
+)
 def get_processor_for_file(file_path: str) -> Optional[callable]:
     """
     """
     file_extension = os.path.splitext(file_path)[1].lower()
+    # Map file extensions to specific processor functions
     processors = {
         ".pdf": process_pdf,
         ".docx": process_docx,
         ".eml": process_email,
         ".epub": process_epub,
         ".txt": process_text,
+        ".csv": process_csv,
         ".rtf": process_text,
         # Code files
 def process_pdf(file_path: str) -> str:
     """
+    Process PDF documents using pymupdf4llm for better PDF handling
     """
+    # For PDFs, we'll still use pymupdf4llm as it handles tables and images better
+    pdf_processor = pymupdf4llm.PdfProcessor(file_path)
+    # Extract text, tables, and images
+    extracted_text = pdf_processor.extract_text()
+    extracted_tables = pdf_processor.extract_tables()
+    extracted_images = pdf_processor.extract_images()
+    # Combine extracted content
+    combined_content = []
+    if extracted_text:
+        combined_content.append(extracted_text)
+    if extracted_tables:
+        for table in extracted_tables:
+            combined_content.append(str(table))
+    if extracted_images:
+        combined_content.append(f"Extracted {len(extracted_images)} images.")
+    return "\n\n".join(combined_content)
 def process_docx(file_path: str) -> str:
     """
+    Process DOCX documents using Langchain's UnstructuredWordDocumentLoader
     """
+    loader = UnstructuredWordDocumentLoader(file_path)
+    docs = loader.load()
+    texts = [doc.page_content for doc in docs if doc.page_content]
     combined_text = "\n\n".join(texts)
     return combined_text
 def process_pptx(file_path: str) -> str:
     """
+    Process PPTX documents using Langchain's UnstructuredPowerPointLoader
     """
+    loader = UnstructuredPowerPointLoader(file_path)
+    docs = loader.load()
+    texts = [doc.page_content for doc in docs if doc.page_content]
     combined_text = "\n\n".join(texts)
     return combined_text
 def process_xlsx(file_path: str) -> str:
     """
+    Process XLSX documents using Langchain's UnstructuredExcelLoader
     """
+    loader = UnstructuredExcelLoader(file_path)
+    docs = loader.load()
+    texts = [doc.page_content for doc in docs if doc.page_content]
     combined_text = "\n\n".join(texts)
     return combined_text
 def process_markdown(file_path: str) -> str:
     """
+    Process Markdown documents using Langchain's UnstructuredMarkdownLoader
     """
+    loader = UnstructuredMarkdownLoader(file_path)
+    docs = loader.load()
+    texts = [doc.page_content for doc in docs if doc.page_content]
     combined_text = "\n\n".join(texts)
     return combined_text
 def process_html(file_path: str) -> str:
     """
+    Process HTML documents using Langchain's UnstructuredHTMLLoader
     """
+    loader = UnstructuredHTMLLoader(file_path)
+    docs = loader.load()
+    texts = [doc.page_content for doc in docs if doc.page_content]
     combined_text = "\n\n".join(texts)
     return combined_text
 def process_xml(file_path: str) -> str:
     """
+    Process XML documents using Langchain's UnstructuredXMLLoader
     """
+    loader = UnstructuredXMLLoader(file_path)
+    docs = loader.load()
+    texts = [doc.page_content for doc in docs if doc.page_content]
     combined_text = "\n\n".join(texts)
     return combined_text
 def process_email(file_path: str) -> str:
     """
+    Process email documents using Langchain's UnstructuredEmailLoader
     """
+    loader = UnstructuredEmailLoader(file_path)
+    docs = loader.load()
+    texts = [doc.page_content for doc in docs if doc.page_content]
     combined_text = "\n\n".join(texts)
     return combined_text
 def process_text(file_path: str) -> str:
     """
+    Process text documents using Langchain's TextLoader
+    """
+    loader = TextLoader(file_path, encoding="utf-8")
+    try:
+        docs = loader.load()
+        texts = [doc.page_content for doc in docs if doc.page_content]
+        combined_text = "\n\n".join(texts)
+        return combined_text
+    except UnicodeDecodeError:
+        # Try with a different encoding if utf-8 fails
+        loader = TextLoader(file_path, encoding="latin-1")
+        docs = loader.load()
+        texts = [doc.page_content for doc in docs if doc.page_content]
+        combined_text = "\n\n".join(texts)
+        return combined_text
+def process_csv(file_path: str) -> str:
+    """
+    Process CSV documents using Langchain's CSVLoader
     """
+    loader = CSVLoader(file_path)
+    docs = loader.load()
+    # Create a formatted string representation of the CSV data
+    rows = []
+    if docs:
+        # Get column names from metadata if available
+        if hasattr(docs[0], 'metadata') and 'columns' in docs[0].metadata:
+            rows.append(",".join(docs[0].metadata['columns']))
+        # Add content rows
+        for doc in docs:
+            rows.append(doc.page_content)
+    return "\n".join(rows)
 def process_epub(file_path: str) -> str:
     """
+    Process EPUB documents using Langchain's UnstructuredEPubLoader
     """
+    loader = UnstructuredEPubLoader(file_path)
+    docs = loader.load()
+    texts = [doc.page_content for doc in docs if doc.page_content]
     combined_text = "\n\n".join(texts)
     return combined_text
 def process_generic(file_path: str) -> str:
     """
+    Generic document processor using Langchain's UnstructuredFileLoader
     """
     try:
+        loader = UnstructuredFileLoader(file_path)
+        docs = loader.load()
+        texts = [doc.page_content for doc in docs if doc.page_content]
         combined_text = "\n\n".join(texts)
         return combined_text
     except Exception as e:
+        # Fall back to basic text processing if UnstructuredFileLoader fails
         try:
             with open(file_path, 'r', encoding='utf-8') as f:
                 return f.read()