Corin1998 committed on
Commit
3447c9a
·
verified ·
1 Parent(s): e199664

Update pipelines/openai_ingest.py

Browse files
Files changed (1) hide show
  1. pipelines/openai_ingest.py +7 -16
pipelines/openai_ingest.py CHANGED
@@ -30,7 +30,6 @@ def _pdf_to_images(pdf_bytes: bytes, dpi: int = 220, max_pages: int = 10) -> Lis
30
 
31
 
32
  def extract_text_with_openai(payload: bytes, filename: str, filetype: str) -> str:
33
- """画像/PDFは画像化してVisionに渡す。テキストは整形依頼してきれいな本文を返す。"""
34
  client = _client_lazy()
35
 
36
  images: List[Image.Image] = []
@@ -38,11 +37,9 @@ def extract_text_with_openai(payload: bytes, filename: str, filetype: str) -> st
38
  images = _pdf_to_images(payload)
39
  elif filetype == "image":
40
  images = [Image.open(io.BytesIO(payload)).convert("RGB")]
41
- else:
42
  text = payload.decode("utf-8", errors="ignore")
43
- prompt = (
44
- "以下は履歴書/職務経歴書の本文です。レイアウトノイズを除去し、見出しや箇条書きを維持しつつ読みやすいテキストに整形して返してください。"
45
- )
46
  resp = client.responses.create(
47
  model=MODEL_TEXT,
48
  input=[
@@ -56,15 +53,9 @@ def extract_text_with_openai(payload: bytes, filename: str, filetype: str) -> st
56
  {"type": "input_text", "text": "日本語の履歴書/職務経歴書の画像です。OCRして本文を日本語テキストで忠実に返してください。"}
57
  ]
58
  for img in images:
59
- content.append({
60
- "type": "input_image",
61
- "image_data": _img_to_base64(img),
62
- })
63
 
64
- resp = client.responses.create(
65
- model=MODEL_VISION,
66
- input=[{"role": "user", "content": content}],
67
- )
68
  return resp.output_text
69
 
70
 
@@ -107,7 +98,7 @@ def summarize_with_openai(text: str) -> dict:
107
  )
108
  full = resp.output_text
109
  return {
110
- "300chars": full[:600] if len(full) > 0 else "",
111
- "100chars": full[:120] if len(full) > 0 else "",
112
- "onesent": full.split("。")[0] + "。" if "。" in full else full,
113
  }
 
30
 
31
 
32
  def extract_text_with_openai(payload: bytes, filename: str, filetype: str) -> str:
 
33
  client = _client_lazy()
34
 
35
  images: List[Image.Image] = []
 
37
  images = _pdf_to_images(payload)
38
  elif filetype == "image":
39
  images = [Image.open(io.BytesIO(payload)).convert("RGB")]
40
+ else: # txt/docxから来たテキストbytes
41
  text = payload.decode("utf-8", errors="ignore")
42
+ prompt = "以下は履歴書/職務経歴書の本文です。レイアウトノイズを除去し、見出しや箇条書きを維持しつつ読みやすいテキストに整形して返してください。"
 
 
43
  resp = client.responses.create(
44
  model=MODEL_TEXT,
45
  input=[
 
53
  {"type": "input_text", "text": "日本語の履歴書/職務経歴書の画像です。OCRして本文を日本語テキストで忠実に返してください。"}
54
  ]
55
  for img in images:
56
+ content.append({"type": "input_image", "image_data": _img_to_base64(img)})
 
 
 
57
 
58
+ resp = client.responses.create(model=MODEL_VISION, input=[{"role": "user", "content": content}])
 
 
 
59
  return resp.output_text
60
 
61
 
 
98
  )
99
  full = resp.output_text
100
  return {
101
+ "300chars": full[:600] if full else "",
102
+ "100chars": full[:120] if full else "",
103
+ "onesent": (full.split("。")[0] + "。") if ("。" in full) else full,
104
  }