verolabz / unstract_utils.py
omgy's picture
Create unstract_utils.py
a478c80 verified
raw
history blame contribute delete
981 Bytes
import os
import requests
# API key for the Unstract service, read once from the environment when this
# module is imported; it is None when the variable is not set.
UNSTRACT_API_KEY = os.environ.get("UNSTRACT_API_KEY")

# URL that extract_layout_text() posts documents to.
UNSTRACT_BASE_URL = "https://api.unstract.com/v1/layout-extraction"
def extract_layout_text(file_path: str, timeout: float = 300.0):
    """Extract text and layout data from a document using the Unstract API.

    The file at *file_path* is uploaded as a multipart form field named
    ``file`` to ``UNSTRACT_BASE_URL``, authenticated through the
    ``x-api-key`` header. Works for PDFs, DOCX, and other supported formats.

    Args:
        file_path: Path of the document to upload.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.post``, so a ``(connect, read)`` tuple is
            accepted as well.

    Returns:
        A ``(extracted_text, layout_data)`` tuple built from the ``"text"``
        and ``"layout"`` keys of the JSON response. A missing key yields
        ``""`` or ``{}`` respectively.

    Raises:
        ValueError: If ``UNSTRACT_API_KEY`` is not set.
        OSError: If *file_path* cannot be opened for reading.
        requests.RequestException: On network failure or timeout.
        RuntimeError: If the API answers with a status other than 200, or
            with a JSON payload that is not an object.
    """
    if not UNSTRACT_API_KEY:
        raise ValueError("Missing UNSTRACT_API_KEY. Please set it in Hugging Face Secrets.")

    headers = {
        "accept": "application/json",
        "x-api-key": UNSTRACT_API_KEY,
    }

    # The file handle must stay open while requests streams it into the
    # multipart body, so the POST happens inside the ``with`` block.
    with open(file_path, "rb") as f:
        response = requests.post(
            UNSTRACT_BASE_URL,
            headers=headers,
            files={"file": f},
            # Without a timeout a stalled server would block the caller forever.
            timeout=timeout,
        )

    if response.status_code != 200:
        raise RuntimeError(f"Unstract API error: {response.status_code}, {response.text}")

    data = response.json()
    if not isinstance(data, dict):
        # Guard against a JSON array/scalar, which has no ``.get``.
        raise RuntimeError(
            f"Unstract API error: unexpected response payload of type {type(data).__name__}"
        )

    extracted_text = data.get("text", "")
    layout_data = data.get("layout", {})
    return extracted_text, layout_data