| import os |
| import requests |
|
|
# Credential for the Unstract service, looked up once when this module is
# imported; the value is None if the environment variable is not set.
UNSTRACT_API_KEY = os.environ.get("UNSTRACT_API_KEY")

# Endpoint that uploaded documents are POSTed to.
UNSTRACT_BASE_URL = "https://api.unstract.com/v1/layout-extraction"
|
|
def extract_layout_text(file_path: str, timeout: "float | None" = 120.0) -> tuple:
    """Upload a document to the Unstract endpoint and return its text and layout.

    The file at ``file_path`` is opened in binary mode and sent as the
    multipart field ``file`` in a POST to ``UNSTRACT_BASE_URL``, authenticated
    with the ``x-api-key`` header. Which document formats are accepted is
    decided by the remote service, not by this function.

    Args:
        file_path: Path of the document to upload.
        timeout: Seconds to wait for the server before giving up, passed
            straight to ``requests.post``. Pass ``None`` to wait indefinitely
            (the behaviour before this parameter existed).

    Returns:
        A ``(extracted_text, layout_data)`` tuple taken from the ``"text"``
        and ``"layout"`` keys of the JSON response. Missing keys fall back to
        ``""`` and ``{}`` respectively.

    Raises:
        ValueError: If ``UNSTRACT_API_KEY`` is not configured. Also raised by
            ``response.json()`` when the body is not valid JSON (the decode
            error is a ``ValueError`` subclass).
        OSError: If ``file_path`` cannot be opened.
        requests.exceptions.RequestException: On connection failures or when
            ``timeout`` elapses.
        RuntimeError: If the service answers with a status other than 200, or
            with a JSON body that is not an object.
    """
    if not UNSTRACT_API_KEY:
        raise ValueError("Missing UNSTRACT_API_KEY. Please set it in Hugging Face Secrets.")

    headers = {
        "accept": "application/json",
        "x-api-key": UNSTRACT_API_KEY,
    }

    # The request must be issued while the file handle is still open, because
    # requests reads the upload body from it.
    with open(file_path, "rb") as f:
        response = requests.post(
            UNSTRACT_BASE_URL,
            headers=headers,
            files={"file": f},
            # Without a timeout a stalled connection would block forever.
            timeout=timeout,
        )

    if response.status_code != 200:
        raise RuntimeError(f"Unstract API error: {response.status_code}, {response.text}")

    data = response.json()
    # Guard against a JSON array/scalar so the failure is explicit rather than
    # an AttributeError on ``.get`` below.
    if not isinstance(data, dict):
        raise RuntimeError(
            f"Unstract API error: expected a JSON object, got {type(data).__name__}"
        )

    extracted_text = data.get("text", "")
    layout_data = data.get("layout", {})
    return extracted_text, layout_data
|
|