"""Document processing utilities: dispatch files to type-specific text extractors."""
import os
import tempfile
from typing import Any, Callable, Dict, List, Optional

import pandas as pd
import pymupdf4llm

# Import Langchain document loaders
from langchain_community.document_loaders import (
    CSVLoader,
    PyMuPDFLoader,
    TextLoader,
    UnstructuredEmailLoader,
    UnstructuredEPubLoader,
    UnstructuredExcelLoader,
    UnstructuredFileLoader,
    UnstructuredHTMLLoader,
    UnstructuredMarkdownLoader,
    UnstructuredPowerPointLoader,
    UnstructuredWordDocumentLoader,
    UnstructuredXMLLoader,
)
def get_processor_for_file(file_path: str) -> Callable[[str], str]:
    """
    Determine the appropriate processor function for the given file type.

    Lookup is by lower-cased file extension. Unknown extensions fall back
    to ``process_generic``, so a processor is always returned — the previous
    ``Optional[callable]`` annotation was misleading (the function never
    returns None, and ``callable`` is the builtin predicate, not a type).

    Args:
        file_path: Path to the file to be processed.

    Returns:
        A callable that takes the file path and returns the extracted text.
    """
    file_extension = os.path.splitext(file_path)[1].lower()
    # Map file extensions to specific processor functions
    processors: Dict[str, Callable[[str], str]] = {
        ".pdf": process_pdf,
        ".docx": process_docx,
        ".doc": process_docx,
        ".pptx": process_pptx,
        ".ppt": process_pptx,
        ".xlsx": process_xlsx,
        ".xls": process_xlsx,
        ".md": process_markdown,
        ".html": process_html,
        ".htm": process_html,
        ".xml": process_xml,
        ".msg": process_email,
        ".eml": process_email,
        ".epub": process_epub,
        ".txt": process_text,
        ".csv": process_csv,
        ".rtf": process_text,
        # Code files are treated as plain text
        ".py": process_text,
        ".js": process_text,
        ".java": process_text,
        ".ts": process_text,
        ".tsx": process_text,
        ".jsx": process_text,
        ".c": process_text,
        ".cpp": process_text,
        ".h": process_text,
        ".cs": process_text,
        ".rb": process_text,
        ".go": process_text,
        ".rs": process_text,
        ".php": process_text,
        ".sql": process_text,
        ".css": process_text,
    }
    return processors.get(file_extension, process_generic)
def process_document(file_path: str) -> Optional[str]:
    """
    Process a document with the processor matched to its file type.

    Args:
        file_path: Path to the document to process.

    Returns:
        The extracted text, or None if no processor could be resolved.
    """
    handler = get_processor_for_file(file_path)
    return handler(file_path) if handler else None
def process_pdf(file_path: str) -> str:
    """
    Process PDF documents using pymupdf4llm for better PDF handling.

    The pymupdf4llm package has no ``PdfProcessor`` class (the previous code
    could never run); its public API is ``to_markdown``, which converts the
    whole PDF into GitHub-flavoured Markdown, preserving table structure and
    embedding image references — the same content the old code tried to
    assemble manually.

    Args:
        file_path: Path to the PDF file.

    Returns:
        Markdown-formatted text extracted from the PDF.
    """
    return pymupdf4llm.to_markdown(file_path)
def process_docx(file_path: str) -> str:
    """
    Process Word documents (.docx/.doc) using Langchain's
    UnstructuredWordDocumentLoader.

    Returns the non-empty document chunks joined by blank lines.
    """
    documents = UnstructuredWordDocumentLoader(file_path).load()
    return "\n\n".join(doc.page_content for doc in documents if doc.page_content)
def process_pptx(file_path: str) -> str:
    """
    Process PowerPoint documents using Langchain's UnstructuredPowerPointLoader.

    Returns the non-empty slide chunks joined by blank lines.
    """
    loader = UnstructuredPowerPointLoader(file_path)
    parts = []
    for document in loader.load():
        if document.page_content:
            parts.append(document.page_content)
    return "\n\n".join(parts)
def process_xlsx(file_path: str) -> str:
    """
    Process Excel documents using Langchain's UnstructuredExcelLoader.

    Returns the non-empty sheet chunks joined by blank lines.
    """
    sheets = UnstructuredExcelLoader(file_path).load()
    return "\n\n".join(s.page_content for s in sheets if s.page_content)
def process_markdown(file_path: str) -> str:
    """
    Process Markdown documents using Langchain's UnstructuredMarkdownLoader.

    Returns the non-empty document chunks joined by blank lines.
    """
    loaded = UnstructuredMarkdownLoader(file_path).load()
    chunks = [d.page_content for d in loaded if d.page_content]
    return "\n\n".join(chunks)
def process_html(file_path: str) -> str:
    """
    Process HTML documents using Langchain's UnstructuredHTMLLoader.

    Returns the non-empty document chunks joined by blank lines.
    """
    docs = UnstructuredHTMLLoader(file_path).load()
    # filter(None, ...) drops empty page_content values
    return "\n\n".join(filter(None, (d.page_content for d in docs)))
def process_xml(file_path: str) -> str:
    """
    Process XML documents using Langchain's UnstructuredXMLLoader.

    Returns the non-empty document chunks joined by blank lines.
    """
    fragments = []
    for document in UnstructuredXMLLoader(file_path).load():
        if document.page_content:
            fragments.append(document.page_content)
    return "\n\n".join(fragments)
def process_email(file_path: str) -> str:
    """
    Process email documents (.msg/.eml) using Langchain's
    UnstructuredEmailLoader.

    Returns the non-empty message chunks joined by blank lines.
    """
    messages = UnstructuredEmailLoader(file_path).load()
    return "\n\n".join(m.page_content for m in messages if m.page_content)
def process_text(file_path: str) -> str:
    """
    Process plain-text documents using Langchain's TextLoader.

    Tries UTF-8 first and falls back to Latin-1 (which can decode any byte
    sequence). Note: TextLoader does not propagate UnicodeDecodeError — it
    re-raises decode failures as RuntimeError (chained ``from`` the original
    error) — so both exception types are caught here; the previous
    ``except UnicodeDecodeError`` made the fallback unreachable.

    Args:
        file_path: Path to the text file.

    Returns:
        The file's contents, with document chunks joined by blank lines.
    """
    try:
        docs = TextLoader(file_path, encoding="utf-8").load()
    except (UnicodeDecodeError, RuntimeError):
        # Retry with a permissive single-byte encoding.
        docs = TextLoader(file_path, encoding="latin-1").load()
    return "\n\n".join(doc.page_content for doc in docs if doc.page_content)
def process_csv(file_path: str) -> str:
    """
    Process CSV documents using Langchain's CSVLoader.

    Builds a newline-joined text representation: an optional header line
    (when the loader exposes column names in metadata) followed by one
    line per row document.
    """
    documents = CSVLoader(file_path).load()
    if not documents:
        return ""
    lines = []
    # Emit column names first if the loader recorded them in metadata.
    first_meta = getattr(documents[0], "metadata", None)
    if first_meta is not None and "columns" in first_meta:
        lines.append(",".join(first_meta["columns"]))
    lines.extend(document.page_content for document in documents)
    return "\n".join(lines)
def process_epub(file_path: str) -> str:
    """
    Process EPUB documents using Langchain's UnstructuredEPubLoader.

    Returns the non-empty document chunks joined by blank lines.
    """
    loaded = UnstructuredEPubLoader(file_path).load()
    contents = (item.page_content for item in loaded if item.page_content)
    return "\n\n".join(contents)
def process_generic(file_path: str) -> str:
    """
    Generic document processor using Langchain's UnstructuredFileLoader.

    If the unstructured loader fails for any reason, falls back to reading
    the file as raw text — first as UTF-8, then as Latin-1. Raises a plain
    Exception combining both error messages when every attempt fails.
    """
    try:
        docs = UnstructuredFileLoader(file_path).load()
        return "\n\n".join(d.page_content for d in docs if d.page_content)
    except Exception as loader_error:
        # Fall back to basic text processing if UnstructuredFileLoader fails.
        last_error = loader_error
        for encoding in ("utf-8", "latin-1"):
            try:
                with open(file_path, "r", encoding=encoding) as handle:
                    return handle.read()
            except Exception as read_error:
                last_error = read_error
        raise Exception(
            f"Could not process file: {str(loader_error)} / {str(last_error)}"
        )