|
|
from langchain_unstructured import UnstructuredLoader |
|
|
|
|
|
from langflow.base.data import BaseFileComponent |
|
|
from langflow.inputs import DropdownInput, MessageTextInput, NestedDictInput, SecretStrInput |
|
|
from langflow.schema import Data |
|
|
|
|
|
|
|
|
class UnstructuredComponent(BaseFileComponent):
    """File component that partitions documents through the Unstructured.io API.

    The component collects the paths of the incoming files, hands them to
    ``UnstructuredLoader`` configured for API-based partitioning, converts the
    returned documents to ``Data`` objects and passes them to the base class's
    ``rollup_data`` together with the original file list.
    """

    display_name = "Unstructured API"
    description = (
        "Uses Unstructured.io API to extract clean text from raw source documents. "
        "Supports a wide range of file types."
    )
    documentation = (
        "https://python.langchain.com/api_reference/unstructured/document_loaders/"
        "langchain_unstructured.document_loaders.UnstructuredLoader.html"
    )
    trace_type = "tool"
    icon = "Unstructured"
    name = "Unstructured"

    # File extensions this component accepts. Each extension is listed once
    # ("png" previously appeared twice).
    VALID_EXTENSIONS = [
        "bmp",
        "csv",
        "doc",
        "docx",
        "eml",
        "epub",
        "heic",
        "html",
        "jpeg",
        "png",
        "md",
        "msg",
        "odt",
        "org",
        "p7s",
        "pdf",
        "ppt",
        "pptx",
        "rst",
        "rtf",
        "tiff",
        "txt",
        "tsv",
        "xls",
        "xlsx",
        "xml",
    ]

    inputs = [
        *BaseFileComponent._base_inputs,
        SecretStrInput(
            name="api_key",
            display_name="Unstructured.io Serverless API Key",
            required=True,
            info="Unstructured API Key. Create at: https://app.unstructured.io/",
        ),
        MessageTextInput(
            name="api_url",
            display_name="Unstructured.io API URL",
            required=False,
            info="Unstructured API URL.",
        ),
        DropdownInput(
            name="chunking_strategy",
            display_name="Chunking Strategy",
            info="Chunking strategy to use, see https://docs.unstructured.io/api-reference/api-services/chunking",
            options=["", "basic", "by_title", "by_page", "by_similarity"],
            real_time_refresh=False,
            value="",
        ),
        NestedDictInput(
            name="unstructured_args",
            display_name="Additional Arguments",
            required=False,
            info=(
                "Optional dictionary of additional arguments to the Loader. "
                "See https://docs.unstructured.io/api-reference/api-services/api-parameters for more information."
            ),
        ),
    ]

    outputs = [
        *BaseFileComponent._base_outputs,
    ]

    def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[BaseFileComponent.BaseFile]:
        """Partition the given files via the Unstructured API.

        Args:
            file_list: Files to process. Entries without a ``path`` are not
                sent to the loader.

        Returns:
            ``file_list`` unchanged when no entry has a path; otherwise the
            result of ``self.rollup_data(file_list, processed_data)``.
        """
        file_paths = [str(file.path) for file in file_list if file.path]

        if not file_paths:
            self.log("No files to process.")
            return file_list

        # Work on a shallow copy: writing into ``self.unstructured_args``
        # directly would inject the API key (and the other settings below)
        # into the user-supplied input dict and carry them over between runs.
        args = {**(self.unstructured_args or {})}

        # An explicitly selected chunking strategy overrides one given in the
        # additional arguments; the empty option leaves the arguments as-is.
        if self.chunking_strategy:
            args["chunking_strategy"] = self.chunking_strategy

        # These component settings always win over the additional arguments.
        args["api_key"] = self.api_key
        args["partition_via_api"] = True
        if self.api_url:
            args["url"] = self.api_url

        loader = UnstructuredLoader(
            file_paths,
            **args,
        )

        documents = loader.load()

        # Falsy documents are kept as ``None`` placeholders.
        processed_data: list[Data | None] = [Data.from_document(doc) if doc else None for doc in documents]

        # Move the "source" value to the field name defined by the base class
        # (presumably what ``rollup_data`` uses to match data back to files --
        # confirm against BaseFileComponent).
        for data in processed_data:
            if data and "source" in data.data:
                data.data[self.SERVER_FILE_PATH_FIELDNAME] = data.data.pop("source")

        return self.rollup_data(file_list, processed_data)
|
|
|