Corin1998 committed on
Commit
bc98150
·
verified ·
1 Parent(s): e8bbd9b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -48
app.py CHANGED
@@ -2,8 +2,8 @@ import os
2
  import io
3
  import json
4
  import hashlib
5
- import pathlib
6
  import gradio as gr
 
7
 
8
  from pipelines.openai_ingest import (
9
  extract_text_with_openai,
@@ -20,21 +20,33 @@ from pipelines.utils import detect_filetype, load_doc_text
20
 
21
  APP_TITLE = "候補者インテーク & レジュメ標準化(OpenAI版)"
22
 
 
23
 
24
- def _read_file_obj_or_path(f):
25
- """gr.Files v4 用の堅牢リーダ。
26
- - type="filepath" の場合: f は str/Path
27
- - type="binary" の場合: f は UploadedFile ライク(.name/.read())
28
- 戻り値: (filename, bytes)
29
  """
30
- if isinstance(f, (str, pathlib.Path)):
31
- p = pathlib.Path(f)
32
- return p.name, p.read_bytes()
33
- # UploadedFile 互換
34
- return getattr(f, "name", "uploaded"), f.read()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
 
36
 
37
- def process_resumes(files, candidate_id: str, additional_notes: str = ""):
38
  if not files:
39
  raise gr.Error("少なくとも1ファイルをアップロードしてください。")
40
 
@@ -42,29 +54,20 @@ def process_resumes(files, candidate_id: str, additional_notes: str = ""):
42
  raw_texts = []
43
 
44
  for f in files:
45
- fname, raw_bytes = _read_file_obj_or_path(f)
46
  filetype = detect_filetype(fname, raw_bytes)
47
 
48
  # 1) テキスト抽出:画像/PDFはOpenAI Vision OCR、docx/txtは生文面+OpenAI整形
49
  if filetype in {"pdf", "image"}:
50
  text = extract_text_with_openai(raw_bytes, filename=fname, filetype=filetype)
51
- elif filetype in {"docx", "txt"}:
52
- base_text = load_doc_text(filetype, raw_bytes)
53
- # 生テキストをOpenAIへ渡し、整形した全文を返す
54
- text = extract_text_with_openai(base_text.encode("utf-8"), filename=fname, filetype="txt")
55
  else:
56
- # フォールバック:そのままテキスト化を試み、ダメならエラーを添えて続行
57
- try:
58
- base_text = raw_bytes.decode("utf-8", errors="ignore")
59
- except Exception:
60
- base_text = ""
61
  text = extract_text_with_openai(base_text.encode("utf-8"), filename=fname, filetype="txt")
62
 
63
  raw_texts.append({"filename": fname, "text": text})
64
 
65
- # 2) OpenAIでセクション構造化
66
  structured = structure_with_openai(text)
67
- # 念のためルールベース正規化も適用(期間抽出など補助)
68
  normalized = normalize_resume({
69
  "work_experience": structured.get("work_experience_raw", ""),
70
  "education": structured.get("education_raw", ""),
@@ -78,10 +81,10 @@ def process_resumes(files, candidate_id: str, additional_notes: str = ""):
78
  "normalized": normalized,
79
  })
80
 
81
- # 3) 統合(複数ファイル→1候補者)
82
  merged = merge_normalized_records([r["normalized"] for r in partial_records])
83
 
84
- # 4) スキル抽出(辞書/正規表現)
85
  merged_text = "\n\n".join([r["text"] for r in partial_records])
86
  skills = extract_skills(merged_text, {
87
  "work_experience": merged.get("raw_sections", {}).get("work_experience", ""),
@@ -97,14 +100,13 @@ def process_resumes(files, candidate_id: str, additional_notes: str = ""):
97
  # 6) 品質スコア
98
  score = compute_quality_score(merged_text, merged)
99
 
100
- # 7) 要約(300/100/1文)
101
  summaries = summarize_with_openai(merged_text)
102
 
103
  # 8) 構造化出力
104
- candidate_id_final = candidate_id or hashlib.sha256(merged_text.encode("utf-8")).hexdigest()[:16]
105
  result_json = {
106
- "candidate_id": candidate_id_final,
107
- "files": [r["source"] for r in partial_records],
108
  "merged": merged,
109
  "skills": skills,
110
  "quality_score": score,
@@ -127,22 +129,24 @@ def process_resumes(files, candidate_id: str, additional_notes: str = ""):
127
  pdf_path=f"candidates/{file_hash}.anon.pdf",
128
  )
129
 
130
- anon_pdf = (candidate_id_final + ".anon.pdf", anon_pdf_bytes)
131
 
132
- # 重要: gr.JSON を避けるため JSON文字列で返す
133
  return (
134
- json.dumps(result_json, ensure_ascii=False, indent=2),
135
- json.dumps(skills, ensure_ascii=False, indent=2),
136
- json.dumps(score, ensure_ascii=False, indent=2),
137
  summaries.get("300chars", ""),
138
  summaries.get("100chars", ""),
139
  summaries.get("onesent", ""),
140
  anon_pdf,
141
- json.dumps(commit_info or {"status": "skipped (DATASET_REPO not set)"}, ensure_ascii=False, indent=2),
142
  )
143
 
144
 
145
- with gr.Blocks(title=APP_TITLE) as demo:
 
 
146
  gr.Markdown(f"# {APP_TITLE}\n複数ファイルを統合→OpenAIで読み込み/構造化/要約→匿名化→Datasets保存")
147
 
148
  with gr.Row():
@@ -150,7 +154,7 @@ with gr.Blocks(title=APP_TITLE) as demo:
150
  label="レジュメ類 (PDF/画像/Word/テキスト) 複数可",
151
  file_count="multiple",
152
  file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".docx", ".txt"],
153
- type="filepath", # v4仕様: 'file' は無効
154
  )
155
  candidate_id = gr.Textbox(label="候補者ID(任意。未入力なら自動生成)")
156
  notes = gr.Textbox(label="補足メモ(任意)", lines=3)
@@ -161,11 +165,11 @@ with gr.Blocks(title=APP_TITLE) as demo:
161
  out_json = gr.Code(label="統合出力 (JSON)")
162
 
163
  with gr.Tab("抽出スキル"):
164
- # gr.JSON は Gradio v4 API 情報生成で例外を起こすケースがあるため避ける
165
- out_skills = gr.Code(label="スキル一覧(JSON)")
166
 
167
  with gr.Tab("品質スコア"):
168
- out_score = gr.Code(label="品質評価(JSON)")
169
 
170
  with gr.Tab("要約 (300/100/1文)"):
171
  out_sum_300 = gr.Textbox(label="300字要約")
@@ -176,7 +180,7 @@ with gr.Blocks(title=APP_TITLE) as demo:
176
  out_pdf = gr.File(label="匿名PDFダウンロード")
177
 
178
  with gr.Tab("Datasets 保存ログ"):
179
- out_commit = gr.Code(label="コミット情報")
180
 
181
  run_btn.click(
182
  process_resumes,
@@ -186,9 +190,7 @@ with gr.Blocks(title=APP_TITLE) as demo:
186
 
187
 
188
  if __name__ == "__main__":
189
- try:
190
- # 通常はローカル公開で起動
191
- demo.launch(server_name="0.0.0.0", server_port=7860)
192
- except ValueError:
193
- # 実行環境が localhost にアクセスできない場合のフォールバック
194
- demo.launch(share=True)
 
2
  import io
3
  import json
4
  import hashlib
 
5
  import gradio as gr
6
+ from typing import Tuple, List, Union
7
 
8
  from pipelines.openai_ingest import (
9
  extract_text_with_openai,
 
20
 
21
  APP_TITLE = "候補者インテーク & レジュメ標準化(OpenAI版)"
22
 
23
+ # ---- helpers ---------------------------------------------------------------
24
 
25
+ def _read_file_input(item: Union[str, "gradio.files.TempFile"]) -> Tuple[bytes, str]:
 
 
 
 
26
  """
27
+ Gradio v4.44 の Files(type='filepath') str パスを返す。
28
+ 互換のため、パス/ファイルライク双方を許容して (bytes, filename) を返す。
29
+ """
30
+ if isinstance(item, str):
31
+ with open(item, "rb") as rf:
32
+ data = rf.read()
33
+ name = os.path.basename(item)
34
+ return data, name
35
+ # UploadedFile 等(念のため)
36
+ if hasattr(item, "read"):
37
+ data = item.read()
38
+ name = getattr(item, "name", "uploaded")
39
+ return data, os.path.basename(name)
40
+ raise ValueError("Unsupported file input type")
41
+
42
+
43
+ def _as_json_code(obj) -> str:
44
+ return json.dumps(obj, ensure_ascii=False, indent=2)
45
+
46
 
47
+ # ---- core pipeline ---------------------------------------------------------
48
 
49
+ def process_resumes(files: List[Union[str, "gradio.files.TempFile"]], candidate_id: str, additional_notes: str = ""):
50
  if not files:
51
  raise gr.Error("少なくとも1ファイルをアップロードしてください。")
52
 
 
54
  raw_texts = []
55
 
56
  for f in files:
57
+ raw_bytes, fname = _read_file_input(f)
58
  filetype = detect_filetype(fname, raw_bytes)
59
 
60
  # 1) テキスト抽出:画像/PDFはOpenAI Vision OCR、docx/txtは生文面+OpenAI整形
61
  if filetype in {"pdf", "image"}:
62
  text = extract_text_with_openai(raw_bytes, filename=fname, filetype=filetype)
 
 
 
 
63
  else:
64
+ base_text = load_doc_text(filetype, raw_bytes)
 
 
 
 
65
  text = extract_text_with_openai(base_text.encode("utf-8"), filename=fname, filetype="txt")
66
 
67
  raw_texts.append({"filename": fname, "text": text})
68
 
69
+ # 2) OpenAIでセクション構造化 → ルール整形
70
  structured = structure_with_openai(text)
 
71
  normalized = normalize_resume({
72
  "work_experience": structured.get("work_experience_raw", ""),
73
  "education": structured.get("education_raw", ""),
 
81
  "normalized": normalized,
82
  })
83
 
84
+ # 3) 統合
85
  merged = merge_normalized_records([r["normalized"] for r in partial_records])
86
 
87
+ # 4) スキル抽出
88
  merged_text = "\n\n".join([r["text"] for r in partial_records])
89
  skills = extract_skills(merged_text, {
90
  "work_experience": merged.get("raw_sections", {}).get("work_experience", ""),
 
100
  # 6) 品質スコア
101
  score = compute_quality_score(merged_text, merged)
102
 
103
+ # 7) 要約
104
  summaries = summarize_with_openai(merged_text)
105
 
106
  # 8) 構造化出力
 
107
  result_json = {
108
+ "candidate_id": candidate_id or hashlib.sha256(merged_text.encode("utf-8")).hexdigest()[:16],
109
+ "files": [os.path.basename(_read_file_input(f)[1]) if isinstance(f, str) else getattr(f, "name", "uploaded") for f in files],
110
  "merged": merged,
111
  "skills": skills,
112
  "quality_score": score,
 
129
  pdf_path=f"candidates/{file_hash}.anon.pdf",
130
  )
131
 
132
+ anon_pdf = (result_json["candidate_id"] + ".anon.pdf", anon_pdf_bytes)
133
 
134
+ # 返却はすべて文字列 or ファイル
135
  return (
136
+ _as_json_code(result_json),
137
+ _as_json_code(skills),
138
+ _as_json_code(score),
139
  summaries.get("300chars", ""),
140
  summaries.get("100chars", ""),
141
  summaries.get("onesent", ""),
142
  anon_pdf,
143
+ _as_json_code(commit_info or {"status": "skipped (DATASET_REPO not set)"}),
144
  )
145
 
146
 
147
+ # ---- UI --------------------------------------------------------------------
148
+
149
+ with gr.Blocks(title=APP_TITLE, analytics_enabled=False) as demo:
150
  gr.Markdown(f"# {APP_TITLE}\n複数ファイルを統合→OpenAIで読み込み/構造化/要約→匿名化→Datasets保存")
151
 
152
  with gr.Row():
 
154
  label="レジュメ類 (PDF/画像/Word/テキスト) 複数可",
155
  file_count="multiple",
156
  file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".docx", ".txt"],
157
+ type="filepath", # 重要: 'file' はNG
158
  )
159
  candidate_id = gr.Textbox(label="候補者ID(任意。未入力なら自動生成)")
160
  notes = gr.Textbox(label="補足メモ(任意)", lines=3)
 
165
  out_json = gr.Code(label="統合出力 (JSON)")
166
 
167
  with gr.Tab("抽出スキル"):
168
+ # gr.JSON は 4.44 でスキーマ事故発生 Code に置換
169
+ out_skills = gr.Code(label="スキル一覧 (JSON)")
170
 
171
  with gr.Tab("品質スコア"):
172
+ out_score = gr.Code(label="品質評価 (JSON)")
173
 
174
  with gr.Tab("要約 (300/100/1文)"):
175
  out_sum_300 = gr.Textbox(label="300字要約")
 
180
  out_pdf = gr.File(label="匿名PDFダウンロード")
181
 
182
  with gr.Tab("Datasets 保存ログ"):
183
+ out_commit = gr.Code(label="コミット情報 (JSON)")
184
 
185
  run_btn.click(
186
  process_resumes,
 
190
 
191
 
192
if __name__ == "__main__":
    # Default to share=True for environments where localhost is unreachable;
    # the GRADIO_SHARE environment variable overrides this.
    _truthy = ("1", "true", "yes", "on")
    share_flag = os.environ.get("GRADIO_SHARE", "true").lower() in _truthy
    demo.launch(server_name="0.0.0.0", server_port=7860, share=share_flag)