Corin1998 committed on
Commit
4328220
·
verified ·
1 Parent(s): 1149a64

Create openai_ingest.py

Browse files
Files changed (1) hide show
  1. pipelines/openai_ingest.py +125 -0
pipelines/openai_ingest.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import base64
import io
import json
import os
from typing import List

from openai import OpenAI
from pdf2image import convert_from_bytes
from PIL import Image
8
+
9
# Model IDs are overridable via environment variables; both vision and
# text tasks default to the same "gpt-4o-mini" model.
MODEL_VISION = os.environ.get("OPENAI_VISION_MODEL", "gpt-4o-mini")
MODEL_TEXT = os.environ.get("OPENAI_TEXT_MODEL", "gpt-4o-mini")

# Lazily-created module-level OpenAI client; populated by _client_lazy().
_client = None
13
+
14
def _client_lazy():
    """Return the process-wide OpenAI client, constructing it on first use.

    The API key is read from the OPENAI_API_KEY environment variable the
    first time this is called; every later call reuses the cached instance.
    """
    global _client
    if _client is not None:
        return _client
    _client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    return _client
19
+
20
+
21
def _img_to_base64(img: Image.Image) -> str:
    """Serialize a PIL image to PNG and return it as a base64 string."""
    with io.BytesIO() as buffer:
        img.save(buffer, format="PNG")
        raw = buffer.getvalue()
    return base64.b64encode(raw).decode("utf-8")
25
+
26
+
27
def _pdf_to_images(pdf_bytes: bytes, dpi: int = 220, max_pages: int = 10) -> List[Image.Image]:
    """Rasterize a PDF to PIL images, keeping at most *max_pages* pages.

    Rendering resolution is controlled by *dpi*; pages beyond the cap are
    discarded after rendering.
    """
    rendered = convert_from_bytes(pdf_bytes, dpi=dpi)
    del rendered[max_pages:]
    return rendered
30
+
31
+
32
def extract_text_with_openai(payload: bytes, filename: str, filetype: str) -> str:
    """Extract clean body text from an uploaded resume file.

    PDF and image payloads are rasterized and OCR'd by the vision model;
    any other filetype is treated as UTF-8 text bytes (e.g. extracted from
    txt/docx) and cleaned up by the text model.

    Args:
        payload: Raw file bytes.
        filename: Original file name (unused here; kept for caller compatibility).
        filetype: "pdf", "image", or anything else for plain text.

    Returns:
        The extracted/cleaned document text as returned by the model.
    """
    client = _client_lazy()

    # Build the page-image list for the vision path.
    images: List[Image.Image] = []
    if filetype == "pdf":
        images = _pdf_to_images(payload)
    elif filetype == "image":
        images = [Image.open(io.BytesIO(payload)).convert("RGB")]
    else:
        # Text bytes coming from txt/docx extraction: ask the text model to
        # strip layout noise while keeping headings and bullet structure.
        text = payload.decode("utf-8", errors="ignore")
        prompt = (
            "以下は履歴書/職務経歴書の本文です。レイアウトノイズを除去し、見出しや箇条書きを維持しつつ読みやすいテキストに整形して返してください。"
        )
        resp = client.responses.create(
            model=MODEL_TEXT,
            input=[
                {"role": "system", "content": "You are a meticulous document cleaner for Japanese resumes."},
                {"role": "user", "content": [{"type": "input_text", "text": prompt + "\n\n" + text}]},
            ],
        )
        return resp.output_text

    # Vision OCR path: a single user message with the instruction followed
    # by every page image.
    content = [
        {"type": "input_text", "text": "日本語の履歴書/職務経歴書の画像です。OCRして本文を日本語テキストで忠実に返してください。"}
    ]
    for img in images:
        # BUG FIX: the Responses API's input_image content part takes an
        # "image_url" (which may be a base64 data URL); the previous
        # "image_data" field is not accepted by the API.
        content.append({
            "type": "input_image",
            "image_url": f"data:image/png;base64,{_img_to_base64(img)}",
        })

    resp = client.responses.create(
        model=MODEL_VISION,
        input=[{"role": "user", "content": content}],
    )
    return resp.output_text
70
+
71
+
72
def structure_with_openai(text: str) -> dict:
    """Structure raw resume text into named sections via the text model.

    The model is asked to emit JSON with the keys work_experience_raw,
    education_raw, certifications_raw and skills_list. If the response is
    not valid JSON, the whole input text is placed under
    work_experience_raw and the other fields default to empty values.

    Returns:
        A dict that always contains all four keys.
    """
    client = _client_lazy()
    sys = (
        "あなたは日本語レジュメの構造化アシスタントです。入力テキストからセクションを抽出し、JSONで返してください。"
        " JSONキー: work_experience_raw, education_raw, certifications_raw, skills_list。"
        " skills_list は重複除去済み配列。work_experience_raw等は原文抜粋で良い。"
    )
    user = (
        "以下のテキストを解析し、指定のJSONキーで返してください。\n\n" + text
    )
    resp = client.responses.create(
        model=MODEL_TEXT,
        input=[
            {"role": "system", "content": [{"type": "input_text", "text": sys}]},
            {"role": "user", "content": [{"type": "input_text", "text": user}]},
        ],
        # BUG FIX: responses.create has no `response_format` parameter
        # (that belongs to Chat Completions); JSON mode on the Responses
        # API is requested through text={"format": {...}}.
        text={"format": {"type": "json_object"}},
    )
    try:
        data = json.loads(resp.output_text)
    except ValueError:  # json.JSONDecodeError subclasses ValueError
        data = {"work_experience_raw": text, "education_raw": "", "certifications_raw": "", "skills_list": []}
    # Guarantee the full key set even when the model omits fields.
    for k in ("work_experience_raw", "education_raw", "certifications_raw"):
        data.setdefault(k, "")
    data.setdefault("skills_list", [])
    return data
99
+
100
+
101
def summarize_with_openai(text: str) -> dict:
    """Summarize resume text at three granularities in a single model call.

    Returns:
        A dict with keys "300chars", "100chars" and "onesent". The values
        are sliced heuristically from the one model response; no strict
        length guarantee is enforced.
    """
    client = _client_lazy()
    prompt = (
        "以下の候補者レジュメ本文を、(1)300字、(2)100字、(3)1文 の3粒度で日本語要約してください。余計な記号は避け、事実を簡潔に。"
    )
    resp = client.responses.create(
        model=MODEL_TEXT,
        input=[
            {"role": "system", "content": [{"type": "input_text", "text": "You write crisp Japanese executive summaries."}]},
            {"role": "user", "content": [{"type": "input_text", "text": prompt + "\n\n" + text}]},
        ],
    )
    full = resp.output_text

    # Removed an unused `_slice` helper that was never called (its regex
    # construction was also broken). The slices below are loose caps on the
    # raw model output, not exact character counts: "300chars" keeps up to
    # 600 chars and "100chars" up to 120, allowing headroom for the model's
    # labels/punctuation. "onesent" is the text up to the first 。.
    return {
        "300chars": full[:600] if full else "",
        "100chars": full[:120] if full else "",
        "onesent": full.split("。")[0] + "。" if "。" in full else full,
    }
+ }