Update pipelines/openai_ingest.py
Browse files
- pipelines/openai_ingest.py +7 -16
pipelines/openai_ingest.py
CHANGED
|
@@ -30,7 +30,6 @@ def _pdf_to_images(pdf_bytes: bytes, dpi: int = 220, max_pages: int = 10) -> Lis
|
|
| 30 |
|
| 31 |
|
| 32 |
def extract_text_with_openai(payload: bytes, filename: str, filetype: str) -> str:
|
| 33 |
-
"""画像/PDFは画像化してVisionに渡す。テキストは整形依頼してきれいな本文を返す。"""
|
| 34 |
client = _client_lazy()
|
| 35 |
|
| 36 |
images: List[Image.Image] = []
|
|
@@ -38,11 +37,9 @@ def extract_text_with_openai(payload: bytes, filename: str, filetype: str) -> st
|
|
| 38 |
images = _pdf_to_images(payload)
|
| 39 |
elif filetype == "image":
|
| 40 |
images = [Image.open(io.BytesIO(payload)).convert("RGB")]
|
| 41 |
-
else:
|
| 42 |
text = payload.decode("utf-8", errors="ignore")
|
| 43 |
-
prompt =
|
| 44 |
-
"以下は履歴書/職務経歴書の本文です。レイアウトノイズを除去し、見出しや箇条書きを維持しつつ読みやすいテキストに整形して返してください。"
|
| 45 |
-
)
|
| 46 |
resp = client.responses.create(
|
| 47 |
model=MODEL_TEXT,
|
| 48 |
input=[
|
|
@@ -56,15 +53,9 @@ def extract_text_with_openai(payload: bytes, filename: str, filetype: str) -> st
|
|
| 56 |
{"type": "input_text", "text": "日本語の履歴書/職務経歴書の画像です。OCRして本文を日本語テキストで忠実に返してください。"}
|
| 57 |
]
|
| 58 |
for img in images:
|
| 59 |
-
content.append({
|
| 60 |
-
"type": "input_image",
|
| 61 |
-
"image_data": _img_to_base64(img),
|
| 62 |
-
})
|
| 63 |
|
| 64 |
-
resp = client.responses.create(
|
| 65 |
-
model=MODEL_VISION,
|
| 66 |
-
input=[{"role": "user", "content": content}],
|
| 67 |
-
)
|
| 68 |
return resp.output_text
|
| 69 |
|
| 70 |
|
|
@@ -107,7 +98,7 @@ def summarize_with_openai(text: str) -> dict:
|
|
| 107 |
)
|
| 108 |
full = resp.output_text
|
| 109 |
return {
|
| 110 |
-
"300chars": full[:600] if
|
| 111 |
-
"100chars": full[:120] if
|
| 112 |
-
"onesent": full.split("。")[0] + "。" if "。" in full else full,
|
| 113 |
}
|
|
|
|
| 30 |
|
| 31 |
|
| 32 |
def extract_text_with_openai(payload: bytes, filename: str, filetype: str) -> str:
|
|
|
|
| 33 |
client = _client_lazy()
|
| 34 |
|
| 35 |
images: List[Image.Image] = []
|
|
|
|
| 37 |
images = _pdf_to_images(payload)
|
| 38 |
elif filetype == "image":
|
| 39 |
images = [Image.open(io.BytesIO(payload)).convert("RGB")]
|
| 40 |
+
else: # txt/docxから来たテキストbytes
|
| 41 |
text = payload.decode("utf-8", errors="ignore")
|
| 42 |
+
prompt = "以下は履歴書/職務経歴書の本文です。レイアウトノイズを除去し、見出しや箇条書きを維持しつつ読みやすいテキストに整形して返してください。"
|
|
|
|
|
|
|
| 43 |
resp = client.responses.create(
|
| 44 |
model=MODEL_TEXT,
|
| 45 |
input=[
|
|
|
|
| 53 |
{"type": "input_text", "text": "日本語の履歴書/職務経歴書の画像です。OCRして本文を日本語テキストで忠実に返してください。"}
|
| 54 |
]
|
| 55 |
for img in images:
|
| 56 |
+
content.append({"type": "input_image", "image_data": _img_to_base64(img)})
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
+
resp = client.responses.create(model=MODEL_VISION, input=[{"role": "user", "content": content}])
|
|
|
|
|
|
|
|
|
|
| 59 |
return resp.output_text
|
| 60 |
|
| 61 |
|
|
|
|
| 98 |
)
|
| 99 |
full = resp.output_text
|
| 100 |
return {
|
| 101 |
+
"300chars": full[:600] if full else "",
|
| 102 |
+
"100chars": full[:120] if full else "",
|
| 103 |
+
"onesent": (full.split("。")[0] + "。") if ("。" in full) else full,
|
| 104 |
}
|