"""OpenAI-backed helpers for resume OCR, section extraction and summarization."""

import base64
import io
import json
import os
from typing import List, Optional

from pdf2image import convert_from_bytes
from PIL import Image
from openai import OpenAI

MODEL_VISION = os.environ.get("OPENAI_VISION_MODEL", "gpt-4o-mini")
MODEL_TEXT = os.environ.get("OPENAI_TEXT_MODEL", "gpt-4o-mini")

# Process-wide client, created on first use by _client_lazy().
_client = None


def _client_lazy():
    """Return the shared OpenAI client, constructing it on first call.

    Raises:
        RuntimeError: if neither OPENAI_API_KEY nor OPENAI_APIKEY is set
            (or both are empty).
    """
    global _client
    if _client is None:
        # OPENAI_APIKEY is accepted as a fallback spelling of the variable.
        key = os.environ.get("OPENAI_API_KEY") or os.environ.get("OPENAI_APIKEY")
        if not key:
            raise RuntimeError("OPENAI_API_KEY (または OPENAI_APIKEY) が未設定です。SpacesのSecretsに追加してください。")
        _client = OpenAI(api_key=key)
    return _client


def _img_to_base64(img: Image.Image) -> str:
    """Encode *img* as PNG and return the bytes as a base64 ASCII string."""
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    return base64.b64encode(buf.getvalue()).decode("utf-8")


def _pdf_to_images(pdf_bytes: bytes, dpi: int = 220, max_pages: int = 10) -> List[Image.Image]:
    """Rasterize at most the first *max_pages* pages of a PDF.

    Args:
        pdf_bytes: Raw PDF file content.
        dpi: Rendering resolution passed to pdf2image.
        max_pages: Upper bound on the number of pages returned; a
            non-positive value yields an empty list.
    """
    if max_pages <= 0:
        return []
    # Limit the page range at render time instead of rasterizing the whole
    # document and discarding the tail.
    pages = convert_from_bytes(pdf_bytes, dpi=dpi, first_page=1, last_page=max_pages)
    return pages[:max_pages]


def _parse_json_object(raw) -> Optional[dict]:
    """Parse *raw* as JSON and return it only if it is a JSON object.

    Returns None when *raw* is not a string/bytes value, is not valid JSON,
    or decodes to something other than a dict.
    """
    try:
        parsed = json.loads(raw)
    except (TypeError, ValueError):  # json.JSONDecodeError is a ValueError
        return None
    return parsed if isinstance(parsed, dict) else None


def extract_text_with_openai(payload: bytes, filename: str, filetype: str) -> str:
    """Extract resume text from an uploaded file using OpenAI models.

    Args:
        payload: Raw file content.
        filename: Original file name (not used by this function).
        filetype: "pdf" or "image" selects the OCR path via MODEL_VISION;
            any other value treats *payload* as UTF-8 text that is cleaned
            up via MODEL_TEXT.

    Returns:
        The model's text output, or "" when a PDF produced no page images.

    Raises:
        RuntimeError: if no API key is configured (see _client_lazy).
    """
    client = _client_lazy()

    if filetype == "pdf":
        images = _pdf_to_images(payload)
    elif filetype == "image":
        images = [Image.open(io.BytesIO(payload)).convert("RGB")]
    else:
        # Plain-text path: undecodable bytes are dropped rather than raising.
        text = payload.decode("utf-8", errors="ignore")
        prompt = "以下は履歴書/職務経歴書の本文です。レイアウトノイズを除去し、見出しや箇条書きを維持しつつ読みやすいテキストに整形して返してください。"
        resp = client.responses.create(
            model=MODEL_TEXT,
            input=[
                {"role": "system", "content": "You are a meticulous document cleaner for Japanese resumes."},
                {"role": "user", "content": [{"type": "input_text", "text": prompt + "\n\n" + text}]},
            ],
        )
        return resp.output_text

    if not images:
        # Nothing to OCR; avoid sending an image prompt with no image.
        return ""

    content = [
        {"type": "input_text", "text": "日本語の履歴書/職務経歴書の画像です。OCRして本文を日本語テキストで忠実に返してください。"}
    ]
    # The Responses API takes inline images as a data URL in `image_url`.
    content.extend(
        {"type": "input_image", "image_url": "data:image/png;base64," + _img_to_base64(img)}
        for img in images
    )
    resp = client.responses.create(model=MODEL_VISION, input=[{"role": "user", "content": content}])
    return resp.output_text


def structure_with_openai(text: str) -> dict:
    """Split resume text into sections using MODEL_TEXT in JSON mode.

    Returns:
        A dict that always contains "work_experience_raw", "education_raw",
        "certifications_raw" (default "") and "skills_list" (default []).
        If the model output is not a JSON object, the whole *text* is placed
        in "work_experience_raw" and the other fields are empty.

    Raises:
        RuntimeError: if no API key is configured (see _client_lazy).
    """
    client = _client_lazy()
    system_prompt = (
        "あなたは日本語レジュメの構造化アシスタントです。入力テキストからセクションを抽出し、JSONで返してください。"
        " JSONキー: work_experience_raw, education_raw, certifications_raw, skills_list。"
        " skills_list は重複除去済み配列。work_experience_raw等は原文抜粋で良い。"
    )
    user_prompt = "以下のテキストを解析し、指定のJSONキーで返してください。\n\n" + text
    resp = client.responses.create(
        model=MODEL_TEXT,
        input=[
            {"role": "system", "content": [{"type": "input_text", "text": system_prompt}]},
            {"role": "user", "content": [{"type": "input_text", "text": user_prompt}]},
        ],
        # JSON mode for the Responses API is configured under `text.format`;
        # `response_format` belongs to Chat Completions.
        text={"format": {"type": "json_object"}},
    )

    data = _parse_json_object(resp.output_text)
    if data is None:
        data = {"work_experience_raw": text, "education_raw": "", "certifications_raw": "", "skills_list": []}
    for key in ("work_experience_raw", "education_raw", "certifications_raw"):
        data.setdefault(key, "")
    data.setdefault("skills_list", [])
    return data


def summarize_with_openai(text: str) -> dict:
    """Summarize resume text at three granularities using MODEL_TEXT.

    The model is asked for a JSON object with one entry per granularity.
    If the reply cannot be parsed that way, the function falls back to
    prefix slices of the raw reply (first 600 chars, first 120 chars, and
    the text up to the first "。").

    Returns:
        A dict with the keys "300chars", "100chars" and "onesent".

    Raises:
        RuntimeError: if no API key is configured (see _client_lazy).
    """
    client = _client_lazy()
    prompt = "以下の候補者レジュメ本文を、(1)300字、(2)100字、(3)1文 の3粒度で日本語要約してください。余計な記号は避け、事実を簡潔に。"
    # Ask for machine-readable output so each granularity can be returned
    # separately instead of guessing boundaries in free-form text.
    json_hint = " 出力はJSONオブジェクトのみとし、キーは summary_300, summary_100, summary_1 を使ってください。"
    resp = client.responses.create(
        model=MODEL_TEXT,
        input=[
            {"role": "system", "content": [{"type": "input_text", "text": "You write crisp Japanese executive summaries."}]},
            {"role": "user", "content": [{"type": "input_text", "text": prompt + json_hint + "\n\n" + text}]},
        ],
        text={"format": {"type": "json_object"}},
    )
    full = resp.output_text

    parsed = _parse_json_object(full)
    wanted = ("summary_300", "summary_100", "summary_1")
    if parsed is not None and all(isinstance(parsed.get(k), str) for k in wanted):
        return {
            "300chars": parsed["summary_300"],
            "100chars": parsed["summary_100"],
            "onesent": parsed["summary_1"],
        }

    # Fallback: legacy behavior based on prefixes of the raw reply.
    full = full or ""
    return {
        "300chars": full[:600],
        "100chars": full[:120],
        "onesent": full.split("。")[0] + "。" if "。" in full else full,
    }