Corin1998 committed on
Commit
4328220
·
verified ·
1 Parent(s): 1149a64

Create openai_ingest.py

Browse files
Files changed (1) hide show
  1. pipelines/openai_ingest.py +125 -0
pipelines/openai_ingest.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import base64
import io
import json
import os
from typing import List

from openai import OpenAI
from pdf2image import convert_from_bytes
from PIL import Image
8
+
9
# Model IDs are overridable via environment variables; both vision and
# text tasks default to the same "gpt-4o-mini" model.
MODEL_VISION = os.environ.get("OPENAI_VISION_MODEL", "gpt-4o-mini")
MODEL_TEXT = os.environ.get("OPENAI_TEXT_MODEL", "gpt-4o-mini")

# Lazily-created module-level OpenAI client; populated by _client_lazy().
_client = None
13
+
14
def _client_lazy():
    """Return the process-wide OpenAI client, constructing it on first use.

    The API key is read from the OPENAI_API_KEY environment variable the
    first time this is called; every later call reuses the cached instance.
    """
    global _client
    if _client is not None:
        return _client
    _client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    return _client
19
+
20
+
21
def _img_to_base64(img: Image.Image) -> str:
    """Serialize a PIL image to PNG and return it as a base64 string."""
    with io.BytesIO() as buffer:
        img.save(buffer, format="PNG")
        raw = buffer.getvalue()
    return base64.b64encode(raw).decode("utf-8")
25
+
26
+
27
def _pdf_to_images(pdf_bytes: bytes, dpi: int = 220, max_pages: int = 10) -> List[Image.Image]:
    """Rasterize a PDF to PIL images, keeping at most *max_pages* pages.

    Rendering resolution is controlled by *dpi*; pages beyond the cap are
    discarded after rendering.
    """
    rendered = convert_from_bytes(pdf_bytes, dpi=dpi)
    del rendered[max_pages:]
    return rendered
30
+
31
+
32
def extract_text_with_openai(payload: bytes, filename: str, filetype: str) -> str:
    """Extract clean body text from an uploaded resume file.

    PDF and image payloads are rasterized and OCR'd by the vision model;
    any other filetype is treated as UTF-8 text bytes (e.g. extracted from
    txt/docx) and cleaned up by the text model.

    Args:
        payload: Raw file bytes.
        filename: Original file name (unused here; kept for caller compatibility).
        filetype: "pdf", "image", or anything else for plain text.

    Returns:
        The extracted/cleaned document text as returned by the model.
    """
    client = _client_lazy()

    # Build the page-image list for the vision path.
    images: List[Image.Image] = []
    if filetype == "pdf":
        images = _pdf_to_images(payload)
    elif filetype == "image":
        images = [Image.open(io.BytesIO(payload)).convert("RGB")]
    else:
        # Text bytes coming from txt/docx extraction: ask the text model to
        # strip layout noise while keeping headings and bullet structure.
        text = payload.decode("utf-8", errors="ignore")
        prompt = (
            "以下は履歴書/職務経歴書の本文です。レイアウトノイズを除去し、見出しや箇条書きを維持しつつ読みやすいテキストに整形して返してください。"
        )
        resp = client.responses.create(
            model=MODEL_TEXT,
            input=[
                {"role": "system", "content": "You are a meticulous document cleaner for Japanese resumes."},
                {"role": "user", "content": [{"type": "input_text", "text": prompt + "\n\n" + text}]},
            ],
        )
        return resp.output_text

    # Vision OCR path: a single user message with the instruction followed
    # by every page image.
    content = [
        {"type": "input_text", "text": "日本語の履歴書/職務経歴書の画像です。OCRして本文を日本語テキストで忠実に返してください。"}
    ]
    for img in images:
        # BUG FIX: the Responses API's input_image content part takes an
        # "image_url" (which may be a base64 data URL); the previous
        # "image_data" field is not accepted by the API.
        content.append({
            "type": "input_image",
            "image_url": f"data:image/png;base64,{_img_to_base64(img)}",
        })

    resp = client.responses.create(
        model=MODEL_VISION,
        input=[{"role": "user", "content": content}],
    )
    return resp.output_text
70
+
71
+
72
def structure_with_openai(text: str) -> dict:
    """Structure raw resume text into named sections via the text model.

    The model is asked to emit JSON with the keys work_experience_raw,
    education_raw, certifications_raw and skills_list. If the response is
    not valid JSON, the whole input text is placed under
    work_experience_raw and the other fields default to empty values.

    Returns:
        A dict that always contains all four keys.
    """
    client = _client_lazy()
    sys = (
        "あなたは日本語レジュメの構造化アシスタントです。入力テキストからセクションを抽出し、JSONで返してください。"
        " JSONキー: work_experience_raw, education_raw, certifications_raw, skills_list。"
        " skills_list は重複除去済み配列。work_experience_raw等は原文抜粋で良い。"
    )
    user = (
        "以下のテキストを解析し、指定のJSONキーで返してください。\n\n" + text
    )
    resp = client.responses.create(
        model=MODEL_TEXT,
        input=[
            {"role": "system", "content": [{"type": "input_text", "text": sys}]},
            {"role": "user", "content": [{"type": "input_text", "text": user}]},
        ],
        # BUG FIX: responses.create has no `response_format` parameter
        # (that belongs to Chat Completions); JSON mode on the Responses
        # API is requested through text={"format": {...}}.
        text={"format": {"type": "json_object"}},
    )
    try:
        data = json.loads(resp.output_text)
    except ValueError:  # json.JSONDecodeError subclasses ValueError
        data = {"work_experience_raw": text, "education_raw": "", "certifications_raw": "", "skills_list": []}
    # Guarantee the full key set even when the model omits fields.
    for k in ("work_experience_raw", "education_raw", "certifications_raw"):
        data.setdefault(k, "")
    data.setdefault("skills_list", [])
    return data
99
+
100
+
101
def summarize_with_openai(text: str) -> dict:
    """Summarize resume text at three granularities in a single model call.

    Returns:
        A dict with keys "300chars", "100chars" and "onesent". The values
        are sliced heuristically from the one model response; no strict
        length guarantee is enforced.
    """
    client = _client_lazy()
    prompt = (
        "以下の候補者レジュメ本文を、(1)300字、(2)100字、(3)1文 の3粒度で日本語要約してください。余計な記号は避け、事実を簡潔に。"
    )
    resp = client.responses.create(
        model=MODEL_TEXT,
        input=[
            {"role": "system", "content": [{"type": "input_text", "text": "You write crisp Japanese executive summaries."}]},
            {"role": "user", "content": [{"type": "input_text", "text": prompt + "\n\n" + text}]},
        ],
    )
    full = resp.output_text

    # Removed an unused `_slice` helper that was never called (its regex
    # construction was also broken). The slices below are loose caps on the
    # raw model output, not exact character counts: "300chars" keeps up to
    # 600 chars and "100chars" up to 120, allowing headroom for the model's
    # labels/punctuation. "onesent" is the text up to the first 。.
    return {
        "300chars": full[:600] if full else "",
        "100chars": full[:120] if full else "",
        "onesent": full.split("。")[0] + "。" if "。" in full else full,
    }
+ }