Spaces:
Sleeping
Sleeping
| from langchain_community.document_loaders import (CSVLoader, WikipediaLoader, UnstructuredURLLoader, | |
| YoutubeLoader, PyPDFLoader, BSHTMLLoader, | |
| Docx2txtLoader, UnstructuredMarkdownLoader) | |
| from langchain_unstructured import UnstructuredLoader | |
class DocumentLoader:
    """Best-effort convenience wrappers around LangChain document loaders.

    Each method builds the matching loader and returns whatever it loads.
    No method raises: on any exception the error is printed and ``None`` is
    returned, so callers must check for ``None`` before using a result.
    """

    def load_unstructured(self, path):
        """
        Load data from a file at the specified path with UnstructuredLoader.

        Supported files (as listed by the original author):
        "csv", "doc", "docx", "epub", "image", "md", "msg", "odt", "org",
        "pdf", "ppt", "pptx", "rtf", "rst", "tsv", "xlsx"

        Args:
            path (str): The file path.
        Returns:
            The loaded data, or None if loading failed.
        Exceptions:
            None are raised; an error message is printed if the loading fails.
        """
        try:
            loader = UnstructuredLoader(path)
            return loader.load()
        except Exception as e:
            print(f"Error loading Unstructured: {e}")
            return None

    def load_csv(self, path):
        """
        Load data from a CSV file at the specified path.

        Args:
            path (str): The file path to the CSV file.
        Returns:
            The loaded CSV data, or None if loading failed.
        Exceptions:
            None are raised; an error message is printed if the CSV loading
            fails.
        """
        try:
            loader = CSVLoader(file_path=path)
            return loader.load()
        except Exception as e:
            print(f"Error loading CSV: {e}")
            return None

    def wikipedia_query(self, search_query):
        """
        Query Wikipedia using a given search term and return the results.

        The loader is created with ``load_max_docs=2``.

        Args:
            search_query (str): The search term to query on Wikipedia.
        Returns:
            The query results, or None if the query failed.
        Exceptions:
            None are raised; an error message is printed if the Wikipedia
            query fails.
        """
        try:
            loader = WikipediaLoader(query=search_query, load_max_docs=2)
            return loader.load()
        except Exception as e:
            print(f"Error querying Wikipedia: {e}")
            return None

    def load_urls(self, urls):
        """
        Load and parse content from one or more URLs.

        Args:
            urls (list | str): A list of URLs to load. A single URL string is
                also accepted and treated as a one-element list.
        Returns:
            The loaded data from the URLs, or None if loading failed.
        Exceptions:
            None are raised; an error message is printed if loading URLs
            fails.
        """
        try:
            # A bare string is itself an iterable of characters; wrap it so
            # the loader receives one URL rather than a sequence of letters.
            url_list = [urls] if isinstance(urls, str) else urls
            loader = UnstructuredURLLoader(urls=url_list)
            return loader.load()
        except Exception as e:
            print(f"Error loading URLs: {e}")
            return None

    def load_YouTubeVideo(self, urls):
        """
        Load YouTube video information from one or more video URLs.

        ``YoutubeLoader.from_youtube_url`` is given one URL at a time, so each
        URL is loaded separately and the documents are concatenated in input
        order.

        Args:
            urls (list | str): A list of YouTube video URLs, or a single URL
                string.
        Returns:
            A list with the documents loaded from all URLs (empty when no
            URLs are given), or None if loading any URL failed.
        Exceptions:
            None are raised; an error message is printed if loading YouTube
            videos fails.
        """
        try:
            # Normalise inside the try so that unusable input (e.g. None) is
            # reported like any other loading failure.
            url_list = [urls] if isinstance(urls, str) else list(urls)
            documents = []
            for url in url_list:
                loader = YoutubeLoader.from_youtube_url(
                    url,
                    add_video_info=True,
                    language=["en", "pt", "zh-Hans", "es", "ur", "hi"],
                    translation="en",
                )
                documents.extend(loader.load())
            return documents
        except Exception as e:
            print(f"Error loading YouTube video: {e}")
            return None

    def load_pdf(self, path):
        """
        Load data from a PDF file at the specified path.

        Args:
            path (str): The file path to the PDF file.
        Returns:
            The loaded and split PDF pages (result of ``load_and_split()``),
            or None if loading failed.
        Exceptions:
            None are raised; an error message is printed if the PDF loading
            fails.
        """
        try:
            loader = PyPDFLoader(path)
            return loader.load_and_split()
        except Exception as e:
            print(f"Error loading PDF: {e}")
            return None

    def load_text_from_html(self, path):
        """
        Load and parse text content from an HTML file at the specified path.

        Args:
            path (str): The file path to the HTML file.
        Returns:
            The loaded HTML data, or None if loading failed.
        Exceptions:
            None are raised; an error message is printed if loading text from
            HTML fails.
        """
        try:
            loader = BSHTMLLoader(path)
            return loader.load()
        except Exception as e:
            print(f"Error loading text from HTML: {e}")
            return None

    def load_markdown(self, path):
        """
        Load data from a Markdown file at the specified path.

        Args:
            path (str): The file path to the Markdown file.
        Returns:
            The loaded Markdown data, or None if loading failed.
        Exceptions:
            None are raised; an error message is printed if loading Markdown
            fails.
        """
        try:
            loader = UnstructuredMarkdownLoader(path)
            return loader.load()
        except Exception as e:
            print(f"Error loading Markdown: {e}")
            return None

    def load_doc(self, path):
        """
        Load data from a DOCX file at the specified path.

        Args:
            path (str): The file path to the DOCX file.
        Returns:
            The loaded DOCX data, or None if loading failed.
        Exceptions:
            None are raised; an error message is printed if loading DOCX
            fails.
        """
        try:
            loader = Docx2txtLoader(path)
            return loader.load()
        except Exception as e:
            print(f"Error loading DOCX: {e}")
            return None