Spaces:

mominah
/

Chatbot-backend

Sleeping

App Files Files Community

Chatbot-backend / document_loaders.py

mominah

Upload 11 files

7b7cab6 verified 12 months ago

raw

history blame contribute delete

5.41 kB

	from langchain_community.document_loaders import (CSVLoader, WikipediaLoader, UnstructuredURLLoader,
	YoutubeLoader, PyPDFLoader, BSHTMLLoader,
	Docx2txtLoader, UnstructuredMarkdownLoader)

	from langchain_unstructured import UnstructuredLoader


	class DocumentLoader:
	    """Thin, best-effort wrappers around the document loaders imported above.

	    Every method follows the same contract: on success it returns whatever
	    the underlying loader returns; on any ``Exception`` it prints an error
	    message to stdout and returns ``None`` instead of raising. Callers must
	    therefore check the result for ``None`` before using it.
	    """

	    def load_unstructured(self, path):
	        """
	        Load data from a file at the specified path with ``UnstructuredLoader``.

	        Supported files (as listed by the original author):
	        "csv", "doc", "docx", "epub", "image", "md", "msg", "odt", "org",
	        "pdf", "ppt", "pptx", "rtf", "rst", "tsv", "xlsx"

	        Args:
	            path (str): The file path, passed unchanged to ``UnstructuredLoader``.

	        Returns:
	            The loaded data, or ``None`` if loading failed.

	        Exceptions:
	            None are propagated; an error message is printed if loading fails.
	        """
	        try:
	            return UnstructuredLoader(path).load()
	        except Exception as e:
	            print(f"Error loading Unstructured: {e}")
	            return None

	    def load_csv(self, path):
	        """
	        Load data from a CSV file at the specified path.

	        Args:
	            path (str): The file path to the CSV file.

	        Returns:
	            The loaded CSV data, or ``None`` if loading failed.

	        Exceptions:
	            None are propagated; an error message is printed if loading fails.
	        """
	        try:
	            return CSVLoader(file_path=path).load()
	        except Exception as e:
	            print(f"Error loading CSV: {e}")
	            return None

	    def wikipedia_query(self, search_query):
	        """
	        Query Wikipedia using a given search term and return the results.

	        The loader is created with ``load_max_docs=2``.

	        Args:
	            search_query (str): The search term to query on Wikipedia.

	        Returns:
	            The query results, or ``None`` if the query failed.

	        Exceptions:
	            None are propagated; an error message is printed if the query fails.
	        """
	        try:
	            return WikipediaLoader(query=search_query, load_max_docs=2).load()
	        except Exception as e:
	            print(f"Error querying Wikipedia: {e}")
	            return None

	    def load_urls(self, urls):
	        """
	        Load and parse content from a list of URLs.

	        Args:
	            urls (list): A list of URLs to load.

	        Returns:
	            The loaded data from the URLs, or ``None`` if loading failed.

	        Exceptions:
	            None are propagated; an error message is printed if loading fails.
	        """
	        try:
	            return UnstructuredURLLoader(urls=urls).load()
	        except Exception as e:
	            print(f"Error loading URLs: {e}")
	            return None

	    def load_YouTubeVideo(self, urls):
	        """
	        Load YouTube video documents from one URL or from several URLs.

	        ``YoutubeLoader.from_youtube_url`` takes a single URL, so a list
	        passed straight through (as the parameter name and the original
	        docstring suggest) could never load. A single string is handled
	        exactly as before; any other iterable is treated as a collection of
	        URLs whose documents are concatenated in input order.

	        Each loader is created with ``add_video_info=True``, the language
	        list ``["en", "pt", "zh-Hans", "es", "ur", "hi"]`` and
	        ``translation="en"``.

	        Args:
	            urls (str | list): One YouTube video URL, or an iterable of them.

	        Returns:
	            The loaded documents (an empty list for an empty iterable), or
	            ``None`` if loading any URL failed.

	        Exceptions:
	            None are propagated; an error message is printed if loading fails.
	        """
	        try:
	            # A bare string is one URL, not an iterable of characters.
	            url_list = [urls] if isinstance(urls, str) else list(urls)
	            documents = []
	            for url in url_list:
	                loader = YoutubeLoader.from_youtube_url(
	                    url,
	                    add_video_info=True,
	                    language=["en", "pt", "zh-Hans", "es", "ur", "hi"],
	                    translation="en",
	                )
	                documents.extend(loader.load())
	            return documents
	        except Exception as e:
	            # All-or-nothing: one failing URL discards the partial results,
	            # matching the None-on-error contract of the other methods.
	            print(f"Error loading YouTube video: {e}")
	            return None

	    def load_pdf(self, path):
	        """
	        Load data from a PDF file at the specified path.

	        Args:
	            path (str): The file path to the PDF file.

	        Returns:
	            The result of ``PyPDFLoader(path).load_and_split()``, or ``None``
	            if loading failed.

	        Exceptions:
	            None are propagated; an error message is printed if loading fails.
	        """
	        try:
	            return PyPDFLoader(path).load_and_split()
	        except Exception as e:
	            print(f"Error loading PDF: {e}")
	            return None

	    def load_text_from_html(self, path):
	        """
	        Load and parse text content from an HTML file at the specified path.

	        Args:
	            path (str): The file path to the HTML file.

	        Returns:
	            The loaded HTML data, or ``None`` if loading failed.

	        Exceptions:
	            None are propagated; an error message is printed if loading fails.
	        """
	        try:
	            return BSHTMLLoader(path).load()
	        except Exception as e:
	            print(f"Error loading text from HTML: {e}")
	            return None

	    def load_markdown(self, path):
	        """
	        Load data from a Markdown file at the specified path.

	        Args:
	            path (str): The file path to the Markdown file.

	        Returns:
	            The loaded Markdown data, or ``None`` if loading failed.

	        Exceptions:
	            None are propagated; an error message is printed if loading fails.
	        """
	        try:
	            return UnstructuredMarkdownLoader(path).load()
	        except Exception as e:
	            print(f"Error loading Markdown: {e}")
	            return None

	    def load_doc(self, path):
	        """
	        Load data from a DOCX file at the specified path.

	        Args:
	            path (str): The file path to the DOCX file.

	        Returns:
	            The loaded DOCX data, or ``None`` if loading failed.

	        Exceptions:
	            None are propagated; an error message is printed if loading fails.
	        """
	        try:
	            return Docx2txtLoader(path).load()
	        except Exception as e:
	            print(f"Error loading DOCX: {e}")
	            return None