pdf-extractor / extractor.py
github-actions[bot]
Sync from GitHub
229a366
import os
import json
import base64
import io
from typing import Dict, Any, List, Tuple, Optional
from openai import OpenAI
import pypdfium2 as pdfium
# Folder holding the JSON extraction templates; it sits next to this file.
TEMPLATES_DIR = os.path.join(os.path.dirname(__file__), "templates")

# One row per document kind: (document_type, template_file, filename keywords).
# Keywords are lowercase substrings looked for in the PDF file name. Rows and
# keywords are listed in priority order, because the registry built from them
# is scanned front to back and the first matching keyword wins.
_TEMPLATE_GROUPS = (
    # Immigration forms
    ("USCIS Form I-129 H-1B Petition", "i129_h1b_petition.json", ("i129",)),
    ("Form I-94 Arrival/Departure Record", "i_94.json", ("i94", "i-94")),
    (
        "Form I-20 Certificate of Eligibility",
        "proof_of_in_country_status.json",
        ("i20", "i-20"),
    ),
    # Identity documents
    ("Passport", "passport.json", ("passport",)),
    ("US Visa", "us_visa.json", ("visa",)),
    # Education documents
    ("Academic Transcript", "school_transcripts.json", ("transcript",)),
    ("Diploma", "diplomas.json", ("diploma",)),
    # Employment documents
    (
        "Employment Letter",
        "employment_letter.json",
        (
            "employment letter",
            "offer letter",
            "offer-letter",
            "offer_letter",
            "employment_letter",
            "employment",
        ),
    ),
    ("Resume/CV", "resume.json", ("resume", "cv")),
    # Tax and corporate documents
    (
        "Corporate Tax Returns",
        "corporate_tax_returns.json",
        ("fein", "cp575", "tax"),
    ),
    # Personal documents
    (
        "Marriage Certificate",
        "marriage_certificate.json",
        ("marriage", "marriage_certificate"),
    ),
    # Proof of status
    (
        "Proof of In-Country Status",
        "proof_of_in_country_status.json",
        ("proof",),
    ),
)

# keyword in PDF filename (lowercase) : { document_type, template_file }
TEMPLATE_REGISTRY: Dict[str, Dict[str, str]] = {
    keyword: {"document_type": document_type, "template_file": template_file}
    for document_type, template_file, keywords in _TEMPLATE_GROUPS
    for keyword in keywords
}
# Model aliases accepted by this extractor (OpenAI ChatGPT models).
_CURRENT_MODEL_ALIASES = (
    "gpt-4.1-mini",
    "gpt-4.1",
    "gpt-4o-mini",
    "gpt-4o",
)
# Legacy/dated aliases kept for compatibility.
_DATED_MODEL_ALIASES = (
    "gpt-4.1-2025-04-14",
    "gpt-4.1-mini-2025-04-14",
    "gpt-5-2025-08-07",
    "gpt-5-mini-2025-08-07",
)
# "default" is a placeholder alias that callers resolve to DEFAULT_MODEL.
ALLOWED_MODELS = ["default", *_CURRENT_MODEL_ALIASES, *_DATED_MODEL_ALIASES]

# Model used when the caller does not pick one; overridable via the
# EXTRACTOR_MODEL_ALIAS environment variable.
DEFAULT_MODEL = os.getenv("EXTRACTOR_MODEL_ALIAS", "gpt-4.1-mini")

# Name of the environment variable that carries the OpenAI API key.
OPENAI_API_KEY_ENV = "OPENAI_API_KEY"
# Module-level cache for the OpenAI client; stays None until
# _get_openai_client() creates the client on first use.
_openai_client: Optional[OpenAI] = None
def load_template(template_file: str) -> Dict[str, Any]:
    """
    Read one JSON template from the templates folder and return it parsed.

    `template_file` is a file name relative to TEMPLATES_DIR.
    Raises FileNotFoundError when no such file exists there.
    """
    template_path = os.path.join(TEMPLATES_DIR, template_file)
    if os.path.exists(template_path):
        with open(template_path, "r", encoding="utf-8") as handle:
            return json.load(handle)
    raise FileNotFoundError(f"Template not found: {template_path}")
def infer_template_from_filename(filename: str) -> Tuple[str, Dict[str, Any]]:
    """
    Pick the document type and extraction template from the PDF file name.

    Only the lowercased base name of `filename` is inspected. Registry
    keywords are tried in registry order and the first one that occurs
    anywhere in the name decides the result.
    Example:
    - 'I129 HALF.pdf' -> matches 'i129' -> uses i129_h1b_petition.json
    - 'passport_rohan.pdf' -> matches 'passport' -> uses passport.json
    - 'F1_visa_page1.pdf' -> matches 'visa' -> uses us_visa.json
    - 'i94_record.pdf' -> matches 'i94' -> uses i_94.json

    Returns a (document_type, template) pair.
    Raises ValueError when no keyword occurs in the name.
    """
    basename = os.path.basename(filename).lower()
    for keyword in TEMPLATE_REGISTRY:
        if keyword not in basename:
            continue
        entry = TEMPLATE_REGISTRY[keyword]
        return entry["document_type"], load_template(entry["template_file"])

    # No keyword matched: fail loudly so the user adds a mapping or renames
    # the file instead of getting a silently wrong template.
    raise ValueError(
        f"Could not infer document type from filename '{basename}'. "
        f"Known keywords: {list(TEMPLATE_REGISTRY.keys())}"
    )
def pdf_bytes_to_base64_images(pdf_bytes: bytes, max_pages: int = 10) -> List[str]:
    """
    Rasterize the leading pages of a PDF into JPEGs and return them as
    base64-encoded strings (raw base64, no data URL prefix), in page order.

    At most `max_pages` pages are rendered; when `max_pages` is None or not
    positive, every page is rendered. The document is closed before returning,
    including when rendering fails.
    """
    document = pdfium.PdfDocument(pdf_bytes)
    encoded_pages: List[str] = []
    try:
        available = len(document)
        has_limit = max_pages is not None and max_pages > 0
        pages_to_render = min(available, max_pages) if has_limit else available

        # Adaptive scale/quality to keep payloads manageable: the more pages
        # are sent, the lower the resolution of each one.
        if pages_to_render <= 2:
            scale, quality = 4.17, 80  # ~300 DPI
        elif pages_to_render <= 10:
            scale, quality = 2.0, 60  # ~145 DPI
        else:
            scale, quality = 1.5, 60  # ~110 DPI

        for index in range(pages_to_render):
            rendered = document[index].render(scale=scale).to_pil()
            jpeg_buffer = io.BytesIO()
            rendered.save(jpeg_buffer, format="JPEG", quality=quality)
            encoded_pages.append(
                base64.b64encode(jpeg_buffer.getvalue()).decode("utf-8")
            )
            jpeg_buffer.close()
            rendered.close()
    finally:
        document.close()
    return encoded_pages
def build_extraction_prompt(document_type: str, template: Dict[str, Any]) -> str:
    """
    Compose the instruction text sent to the model.

    The prompt names the document type, embeds `template` as pretty-printed
    JSON (2-space indent) as the required output structure, and lists the
    output rules. The text starts and ends with a newline.
    """
    template_json = json.dumps(template, indent=2)
    prompt_lines = [
        "",
        "You are a document data extraction system.",
        f"Document Type: {document_type}",
        "Extract all information from the provided document image(s) and return it in the following exact JSON structure:",
        template_json,
        "Instructions:",
        "- Output only valid JSON matching exactly the structure above",
        "- Do NOT add explanations",
        "- Do NOT wrap the JSON in markdown, backticks, or code fences",
        '- If a field is missing, set it to ""',
        "- Use the exact field names; do not modify the structure",
        "- Extract information from ALL pages",
        "",
    ]
    return "\n".join(prompt_lines)
def _get_openai_client() -> OpenAI:
    """
    Return the shared OpenAI client, creating it on the first call.

    The API key is read from the environment variable named by
    OPENAI_API_KEY_ENV. Raises RuntimeError when that variable is unset or
    empty.
    """
    global _openai_client
    if _openai_client is not None:
        return _openai_client

    api_key = os.getenv(OPENAI_API_KEY_ENV)
    if not api_key:
        raise RuntimeError(
            f"{OPENAI_API_KEY_ENV} is not set. "
            "Set it in your environment or CI secrets."
        )
    _openai_client = OpenAI(api_key=api_key)
    return _openai_client
def _text_of_block(block: Any) -> str:
    """
    Return the text of one content block, or "" when the block is not a
    text block ("output_text" / "text") or carries no string text.
    """
    if isinstance(block, dict):
        if block.get("type") not in ("output_text", "text"):
            return ""
        text = block.get("text", "")
    else:
        if getattr(block, "type", None) not in ("output_text", "text"):
            return ""
        text = getattr(block, "text", "")
    # A text block may carry text=None (or another non-string); joining that
    # would raise TypeError, so treat it as empty instead.
    return text if isinstance(text, str) else ""


def _extract_text_from_response(response: Any) -> str:
    """
    Pull the model's text out of a response object.

    Prefers the `output_text` attribute when it is a non-blank string.
    Otherwise walks `response.output` (when it is a list), reads each item's
    `content` list (attribute access first, dict key as fallback), and
    concatenates the text of every "output_text"/"text" block. Items and
    blocks may be objects or plain dicts.

    Returns the stripped text, or "" when no text is found.
    """
    output_text = getattr(response, "output_text", None)
    if isinstance(output_text, str) and output_text.strip():
        return output_text.strip()

    output = getattr(response, "output", None)
    if not isinstance(output, list):
        return ""

    parts: List[str] = []
    for item in output:
        content = getattr(item, "content", None)
        if content is None and isinstance(item, dict):
            content = item.get("content")
        if isinstance(content, list):
            parts.extend(_text_of_block(block) for block in content)
    return "".join(parts).strip()
# Model-name prefixes for which the request must not carry `temperature`.
# NOTE(review): GPT-5 family and o-series reasoning models are documented as
# rejecting sampling parameters such as `temperature`; confirm the prefix list
# against the current OpenAI model documentation.
_NO_TEMPERATURE_MODEL_PREFIXES = ("gpt-5", "o1", "o3", "o4")


def _invoke_openai(prompt: str, images: List[str], model: str) -> Any:
    """
    Call OpenAI ChatGPT with the given prompt + images and return the response.

    `images` are base64-encoded JPEGs; each one is sent as a data URL after
    the prompt text in a single user message. `temperature=0` is requested
    for deterministic output, except for model families that do not accept
    the parameter (the allow-list of this module includes GPT-5 aliases, for
    which sending it would make the request fail).
    """
    client = _get_openai_client()

    user_content: List[Dict[str, Any]] = [
        {"type": "input_text", "text": prompt},
    ]
    user_content.extend(
        {
            "type": "input_image",
            "image_url": f"data:image/jpeg;base64,{img_b64}",
        }
        for img_b64 in images
    )

    request: Dict[str, Any] = {
        "model": model,
        "input": [
            {
                "role": "system",
                "content": [
                    {
                        "type": "input_text",
                        "text": "You are a precise document extraction engine.",
                    }
                ],
            },
            {
                "role": "user",
                "content": user_content,
            },
        ],
    }
    if not model.startswith(_NO_TEMPERATURE_MODEL_PREFIXES):
        request["temperature"] = 0

    return client.responses.create(**request)
# Concrete model used when the "default" alias cannot be resolved through
# DEFAULT_MODEL (i.e. when DEFAULT_MODEL is itself "default").
_FALLBACK_MODEL = "gpt-4.1-mini"


def _strip_markdown_fences(text: str) -> str:
    """
    Remove a leading and a trailing markdown fence line (```json ... ```)
    from `text`; text that does not start with a fence is returned unchanged.
    """
    if not text.startswith("```"):
        return text
    lines = text.splitlines()
    if lines and lines[0].lstrip().startswith("```"):
        lines = lines[1:]
    if lines and lines[-1].strip().startswith("```"):
        lines = lines[:-1]
    return "\n".join(lines).strip()


def call_openai_extract(
    document_type: str,
    template: Dict[str, Any],
    images: List[str],
    model: str = DEFAULT_MODEL,
) -> Dict[str, Any]:
    """
    Call OpenAI ChatGPT to extract structured JSON for the given
    document type and template.

    `model` must be one of ALLOWED_MODELS; the alias "default" resolves to
    DEFAULT_MODEL. `images` are base64-encoded page images.

    Returns the parsed JSON produced by the model.
    Raises ValueError when the model alias is unsupported, when the response
    carries no text, or when the text is not valid JSON.
    """
    resolved_model = DEFAULT_MODEL if model == "default" else model
    if resolved_model == "default":
        # DEFAULT_MODEL comes from the environment and may itself be the
        # placeholder "default". That string passes the allow-list check but
        # is not a real model name, so never forward it to the API.
        resolved_model = _FALLBACK_MODEL
    if resolved_model not in ALLOWED_MODELS:
        raise ValueError(
            f"Unsupported model alias '{model}'. "
            f"Supported values: {ALLOWED_MODELS}. "
            "This extractor uses OpenAI ChatGPT models."
        )

    prompt = build_extraction_prompt(document_type, template)
    response = _invoke_openai(prompt, images, resolved_model)

    # The prompt forbids code fences, but tolerate them if the model adds them.
    json_str = _strip_markdown_fences(_extract_text_from_response(response).strip())
    if not json_str:
        raise ValueError(
            "Model response did not contain any text content to parse as JSON."
        )

    try:
        return json.loads(json_str)
    except json.JSONDecodeError as exc:
        # Include the head of the output so the failure can be diagnosed.
        snippet = json_str[:500]
        raise ValueError(
            f"Model output was not valid JSON: {exc}. "
            f"First 500 characters of response: {snippet!r}"
        ) from exc
def extract_using_openai_from_pdf_bytes(
    pdf_bytes: bytes,
    filename: str,
    max_pages: int = 10,
    model: str = DEFAULT_MODEL,
) -> Dict[str, Any]:
    """
    Backwards-compatible entrypoint used by the Vision Lambda.

    Runs the whole pipeline for one PDF: the template is chosen from
    `filename`, up to `max_pages` pages of `pdf_bytes` are rendered to
    images, and OpenAI ChatGPT performs the extraction, preserving the JSON
    contract.

    Raises RuntimeError when rendering yields no images.
    """
    document_type, template = infer_template_from_filename(filename)
    page_images = pdf_bytes_to_base64_images(pdf_bytes, max_pages=max_pages)
    if page_images:
        return call_openai_extract(document_type, template, page_images, model=model)
    raise RuntimeError("No images were extracted from PDF")
def _prompt_for_pdf_path() -> str:
    """
    Ask on stdin for the path of the PDF to process (local CLI runs only;
    web UI integrations can call extract_using_openai_from_pdf_bytes
    directly instead).

    Returns the entered path with surrounding whitespace removed.
    Exits via SystemExit when nothing was entered.
    """
    entered = input("Enter path to PDF: ").strip()
    if entered:
        return entered
    raise SystemExit("No PDF path provided.")
if __name__ == "__main__":
    # Local CLI run: ask for a PDF, extract it, and print the result as one
    # line of JSON (non-ASCII characters are kept as-is).
    cli_pdf_path = _prompt_for_pdf_path()
    with open(cli_pdf_path, "rb") as pdf_file:
        cli_pdf_bytes = pdf_file.read()
    print(
        json.dumps(
            extract_using_openai_from_pdf_bytes(cli_pdf_bytes, cli_pdf_path),
            ensure_ascii=False,
        )
    )