Spaces:
Sleeping
Sleeping
Create unstructuredio/unstructured_pdf.py
Browse files
unstructuredio/unstructured_pdf.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from unstructured.partition.pdf import partition_pdf
|
| 2 |
+
import tempfile
|
| 3 |
+
from typing import List, Union, Optional
|
| 4 |
+
from indexify_extractor_sdk import Content, Extractor, Feature
|
| 5 |
+
from pydantic import BaseModel, Field
|
| 6 |
+
|
| 7 |
+
# Options for the unstructured.io PDF extractor; each field is passed to
# ``partition_pdf`` as the keyword argument of the same name.
class UnstructuredIOConfig(BaseModel):
    # One of "auto", "hi_res", "ocr_only", or "fast".
    strategy: Optional[str] = "auto"
    # Model name handed to ``partition_pdf`` as ``hi_res_model_name``.
    hi_res_model_name: Optional[str] = "yolox"
    # Handed to ``partition_pdf`` as ``infer_table_structure``.
    infer_table_structure: Optional[bool] = Field(default=True)
|
| 11 |
+
|
| 12 |
+
class UnstructuredIOExtractor(Extractor):
    """Extractor that splits a PDF into plain-text pieces with unstructured.io.

    Each element returned by ``partition_pdf`` becomes one text ``Content``
    carrying a metadata feature with the element's type name and page number.
    """

    name = "tensorlake/unstructuredio"
    description = "This extractor uses unstructured.io to extract pieces of pdf document into separate plain text content data."
    system_dependencies = ["libmagic-dev", "poppler-utils", "tesseract-ocr"]
    input_mime_types = ["application/pdf"]

    def __init__(self):
        super(UnstructuredIOExtractor, self).__init__()

    def extract(self, content: Content, params: UnstructuredIOConfig) -> List[Union[Feature, Content]]:
        """Partition the PDF bytes in ``content.data`` into text contents.

        Args:
            content: Input whose ``data`` attribute holds the raw PDF bytes.
            params: Partitioning options forwarded to ``partition_pdf``.

        Returns:
            One ``Content`` per partitioned element, in the order returned by
            ``partition_pdf``. Each has a metadata feature with the keys
            ``"type"`` (element class name) and ``"page_number"``.
        """
        # Imported locally so this block does not depend on a file-level import.
        import os

        # partition_pdf is given a filesystem path, so the bytes are spilled
        # to a named temp file. delete=False lets the file be closed before
        # it is reopened by name; it is removed explicitly in ``finally``.
        inputtmpfile = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
        try:
            with inputtmpfile:
                inputtmpfile.write(content.data)
            # Leaving the ``with`` closed (and therefore flushed) the file.
            elements = partition_pdf(
                inputtmpfile.name,
                strategy=params.strategy,
                hi_res_model_name=params.hi_res_model_name,
                infer_table_structure=params.infer_table_structure,
            )
        finally:
            # Previously the temp file was never deleted and leaked one file
            # per extracted document.
            os.unlink(inputtmpfile.name)

        contents = []
        for el in elements:
            feature = Feature.metadata(value={"type": type(el).__name__, "page_number": el.metadata.page_number})
            contents.append(Content.from_text(el.text, features=[feature]))

        return contents

    def sample_input(self) -> Content:
        """Return the sample scientific PDF provided by ``sample_scientific_pdf``."""
        return self.sample_scientific_pdf()
|