Spaces:
Sleeping
Sleeping
| from langchain_community.document_loaders import PyPDFLoader, TextLoader, CSVLoader # Updated imports | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| import os | |
| import glob | |
def _load_files(pattern, label, make_loader):
    """Load every file matching *pattern* and tag its documents with the file name.

    Args:
        pattern: Glob pattern selecting the files to load.
        label: Short file-type name used in error messages (e.g. "PDF").
        make_loader: Callable that takes a file path and returns an object
            whose ``load()`` method returns the documents for that file.

    Returns:
        tuple: ``(paths, docs)`` - the sorted list of matched paths and the
        documents loaded from the files that could be read.
    """
    # glob returns paths in arbitrary order; sort so chunk order is reproducible.
    paths = sorted(glob.glob(pattern))
    loaded = []
    for path in paths:
        try:
            docs = make_loader(path).load()
        except Exception as e:
            # Best-effort ingest: report the unreadable file and keep going,
            # so one corrupt input does not discard everything else.
            print(f"Error loading {label} file {path}: {e}")
            continue
        # Record only the file name (not the full path) as the source.
        source_name = os.path.basename(path)
        for doc in docs:
            doc.metadata["source"] = source_name
        loaded.extend(docs)
    return paths, loaded


def load_and_split(data_dir="data", chunk_size=1000, chunk_overlap=200):
    """
    Load documents from various sources and split into chunks with metadata preservation.

    PDF, text and CSV files directly inside ``data_dir`` are loaded, each
    document's ``metadata["source"]`` is set to the base name of the file it
    came from, and the documents are then split into overlapping chunks.
    A file that fails to load is reported on stdout and skipped.

    Args:
        data_dir (str): Folder to scan for ``*.pdf``, ``*.txt`` and ``*.csv``
            files. Defaults to ``"data"``.
        chunk_size (int): ``chunk_size`` passed to the text splitter.
        chunk_overlap (int): ``chunk_overlap`` passed to the text splitter.

    Returns:
        list: List of Document objects with text and metadata (empty when
        nothing could be loaded).
    """
    # Escape the folder name so glob metacharacters in it are taken literally.
    base = glob.escape(os.fspath(data_dir))

    # Loaders are wrapped in small factories so each one is only looked up
    # when a matching file actually exists.
    pdf_files, pdf_docs = _load_files(
        os.path.join(base, "*.pdf"), "PDF", lambda path: PyPDFLoader(path)
    )
    txt_files, txt_docs = _load_files(
        os.path.join(base, "*.txt"), "TXT", lambda path: TextLoader(path)
    )
    csv_files, csv_docs = _load_files(
        os.path.join(base, "*.csv"), "CSV", lambda path: CSVLoader(path)
    )

    # Keep the PDF -> TXT -> CSV ordering of the combined document list.
    documents = [*pdf_docs, *txt_docs, *csv_docs]
    print(f"Loaded {len(documents)} documents from {len(pdf_files)} PDFs, {len(txt_files)} TXTs, and {len(csv_files)} CSVs")

    if documents:
        # Split documents into manageable chunks, trying paragraph breaks
        # first, then line breaks, then spaces, then single characters.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", " ", ""],
        )
        split_docs = text_splitter.split_documents(documents)
    else:
        # Nothing was loaded, so there is nothing to split.
        split_docs = []

    print(f"Split into {len(split_docs)} chunks")
    return split_docs
if __name__ == "__main__":
    # Script entry point: run the full load-and-split pass over the data
    # folder and report how many chunks it produced.
    chunks = load_and_split()
    print(f"Loaded and split {len(chunks)} text chunks from all documents!")