Corin1998 committed on
Commit
0db7548
·
verified ·
1 Parent(s): 1c1a2e4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -32
app.py CHANGED
@@ -19,6 +19,41 @@ from pipelines.utils import detect_filetype, load_doc_text
19
 
20
  APP_TITLE = "候補者インテーク & レジュメ標準化(OpenAI版)"
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  def process_resumes(files, candidate_id: str, additional_notes: str = ""):
24
  if not files:
@@ -27,21 +62,19 @@ def process_resumes(files, candidate_id: str, additional_notes: str = ""):
27
  partial_records = []
28
  raw_texts = []
29
 
30
- for f in files:
31
- raw_bytes = f.read()
32
- filetype = detect_filetype(f.name, raw_bytes)
33
 
34
  # 1) テキスト抽出:画像/PDFはOpenAI Vision OCR、docx/txtは生文面+OpenAI整形
35
  if filetype in {"pdf", "image"}:
36
- text = extract_text_with_openai(raw_bytes, filename=f.name, filetype=filetype)
37
  else:
38
  base_text = load_doc_text(filetype, raw_bytes)
39
- # 生テキストをOpenAIへ渡し、整形本文を返す
40
- text = extract_text_with_openai(base_text.encode("utf-8"), filename=f.name, filetype="txt")
41
 
42
- raw_texts.append({"filename": f.name, "text": text})
43
 
44
- # 2) OpenAIでセクション構造化 → ルールベース正規化
45
  structured = structure_with_openai(text)
46
  normalized = normalize_resume({
47
  "work_experience": structured.get("work_experience_raw", ""),
@@ -50,7 +83,7 @@ def process_resumes(files, candidate_id: str, additional_notes: str = ""):
50
  "skills": ", ".join(structured.get("skills_list", [])),
51
  })
52
  partial_records.append({
53
- "source": f.name,
54
  "text": text,
55
  "structured": structured,
56
  "normalized": normalized,
@@ -68,20 +101,21 @@ def process_resumes(files, candidate_id: str, additional_notes: str = ""):
68
  "skills": ", ".join(merged.get("skills", [])),
69
  })
70
 
71
- # 5) 匿名化
72
  anonymized_text, anon_map = anonymize_text(merged_text)
73
  anon_pdf_bytes = render_anonymized_pdf(anonymized_text)
74
 
75
  # 6) 品質スコア
76
  score = compute_quality_score(merged_text, merged)
77
 
78
- # 7) 要約(300/100/1文)
79
  summaries = summarize_with_openai(merged_text)
80
 
81
- # 8) 構造化出力(UIバグ回避のため **すべて文字列** で返す)
 
82
  result_json = {
83
- "candidate_id": candidate_id or hashlib.sha256(merged_text.encode("utf-8")).hexdigest()[:16],
84
- "files": [f.name for f in files],
85
  "merged": merged,
86
  "skills": skills,
87
  "quality_score": score,
@@ -90,29 +124,28 @@ def process_resumes(files, candidate_id: str, additional_notes: str = ""):
90
  "notes": additional_notes,
91
  }
92
 
93
- # 9) HF Datasets 保存
94
  dataset_repo = os.environ.get("DATASET_REPO")
95
  commit_info = None
96
  if dataset_repo:
97
- file_hash = result_json["candidate_id"]
98
  commit_info = persist_to_hf(
99
  dataset_repo=dataset_repo,
100
  record=result_json,
101
  anon_pdf_bytes=anon_pdf_bytes,
102
- parquet_path=f"candidates/{file_hash}.parquet",
103
- json_path=f"candidates/{file_hash}.json",
104
- pdf_path=f"candidates/{file_hash}.anon.pdf",
105
  )
106
 
107
- anon_pdf = (result_json["candidate_id"] + ".anon.pdf", anon_pdf_bytes)
108
 
109
  return (
110
- json.dumps(result_json, ensure_ascii=False, indent=2), # JSON全体
111
- json.dumps(skills, ensure_ascii=False, indent=2), # skillsは文字列返す
112
- json.dumps(score, ensure_ascii=False, indent=2), # スコアも文字列
113
- summaries.get("300chars", ""),
114
- summaries.get("100chars", ""),
115
- summaries.get("onesent", ""),
116
  anon_pdf,
117
  json.dumps(commit_info or {"status": "skipped (DATASET_REPO not set)"}, ensure_ascii=False, indent=2),
118
  )
@@ -126,21 +159,21 @@ with gr.Blocks(title=APP_TITLE) as demo:
126
  label="レジュメ類 (PDF/画像/Word/テキスト) 複数可",
127
  file_count="multiple",
128
  file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".docx", ".txt"],
129
- type="file" # fileオブジェクトを受ける(process_resumesの実装に合致)
130
  )
131
  candidate_id = gr.Textbox(label="候補者ID(任意。未入力なら自動生成)")
132
  notes = gr.Textbox(label="補足メモ(任意)", lines=3)
133
 
134
- run_btn = gr.Button("実行")
135
 
136
  with gr.Tab("構造化JSON"):
137
- out_json = gr.Code(label="統合出力 (JSON文字列)")
138
 
139
  with gr.Tab("抽出スキル"):
140
- out_skills = gr.Code(label="スキル一覧 (JSON文字列)") # gr.JSON を使わない
141
 
142
  with gr.Tab("品質スコア"):
143
- out_score = gr.Code(label="品質評価 (JSON文字列)")
144
 
145
  with gr.Tab("要約 (300/100/1文)"):
146
  out_sum_300 = gr.Textbox(label="300字要約")
@@ -151,7 +184,7 @@ with gr.Blocks(title=APP_TITLE) as demo:
151
  out_pdf = gr.File(label="匿名PDFダウンロード")
152
 
153
  with gr.Tab("Datasets 保存ログ"):
154
- out_commit = gr.Code(label="コミット情報 (JSON文字列)")
155
 
156
  run_btn.click(
157
  process_resumes,
 
19
 
20
  APP_TITLE = "候補者インテーク & レジュメ標準化(OpenAI版)"
21
 
22
+ # Gradio v4 Filesの入力を安全にハンドリング
23
def _iter_files(files):
    """Yield ``(name, data)`` pairs for each uploaded file in *files*.

    Accepted item shapes:

    - ``str`` / ``os.PathLike``: a path on disk (what Gradio's
      ``type="filepath"`` delivers); the file is read in binary mode and
      the base name of the path is used as the name.
    - ``bytes`` / ``bytearray``: raw content (what Gradio's
      ``type="binary"`` delivers); no filename is available, so the
      placeholder name ``"uploaded"`` is used.
    - ``dict`` with ``"name"`` and ``"data"`` keys: kept for compatibility.
    - file-like object exposing ``read()``: kept for compatibility.

    A single item that is not wrapped in a list is treated as a
    one-element list. ``None`` or an empty input yields nothing. Items of
    any other shape are skipped.

    Raises:
        OSError: if a path item cannot be opened or read.
    """
    if not files:
        return

    # A bare path / blob / dict / file object would otherwise be iterated
    # element by element (e.g. a path string character by character).
    single_item = isinstance(
        files, (str, bytes, bytearray, dict, os.PathLike)
    ) or hasattr(files, "read")
    if single_item:
        files = [files]

    for f in files:
        # Path on disk.
        if isinstance(f, (str, os.PathLike)):
            path = os.fspath(f)
            name = os.path.basename(path)
            # Read fully and close before yielding so the handle is not
            # held open while the consumer processes the data.
            with open(path, "rb") as fh:
                data = fh.read()
            yield name, data
            continue

        # Raw binary payload without a filename.
        if isinstance(f, (bytes, bytearray)):
            yield "uploaded", bytes(f)
            continue

        # Mapping carrying both name and payload.
        if isinstance(f, dict) and "name" in f and "data" in f:
            yield os.path.basename(f["name"] or "uploaded"), f["data"]
            continue

        # File-like object.
        if hasattr(f, "read"):
            raw_name = getattr(f, "name", None)
            # ``name`` may be missing, empty, or not a string (e.g. an
            # integer file descriptor); fall back to a placeholder.
            if isinstance(raw_name, str) and raw_name:
                name = os.path.basename(raw_name)
            else:
                name = "uploaded"
            yield name, f.read()
            continue

        # Unknown shape: skip.
56
+
57
 
58
  def process_resumes(files, candidate_id: str, additional_notes: str = ""):
59
  if not files:
 
62
  partial_records = []
63
  raw_texts = []
64
 
65
+ for fname, raw_bytes in _iter_files(files):
66
+ filetype = detect_filetype(fname, raw_bytes)
 
67
 
68
  # 1) テキスト抽出:画像/PDFはOpenAI Vision OCR、docx/txtは生文面+OpenAI整形
69
  if filetype in {"pdf", "image"}:
70
+ text = extract_text_with_openai(raw_bytes, filename=fname, filetype=filetype)
71
  else:
72
  base_text = load_doc_text(filetype, raw_bytes)
73
+ text = extract_text_with_openai(base_text.encode("utf-8"), filename=fname, filetype="txt")
 
74
 
75
+ raw_texts.append({"filename": fname, "text": text})
76
 
77
+ # 2) OpenAIでセクション構造化 → ルール正規化
78
  structured = structure_with_openai(text)
79
  normalized = normalize_resume({
80
  "work_experience": structured.get("work_experience_raw", ""),
 
83
  "skills": ", ".join(structured.get("skills_list", [])),
84
  })
85
  partial_records.append({
86
+ "source": fname,
87
  "text": text,
88
  "structured": structured,
89
  "normalized": normalized,
 
101
  "skills": ", ".join(merged.get("skills", [])),
102
  })
103
 
104
+ # 5) 匿名化 → 匿名PDF生成
105
  anonymized_text, anon_map = anonymize_text(merged_text)
106
  anon_pdf_bytes = render_anonymized_pdf(anonymized_text)
107
 
108
  # 6) 品質スコア
109
  score = compute_quality_score(merged_text, merged)
110
 
111
+ # 7) 要約
112
  summaries = summarize_with_openai(merged_text)
113
 
114
+ # 8) 構造化出力
115
+ cid = candidate_id or hashlib.sha256(merged_text.encode("utf-8")).hexdigest()[:16]
116
  result_json = {
117
+ "candidate_id": cid,
118
+ "files": [r["source"] for r in partial_records],
119
  "merged": merged,
120
  "skills": skills,
121
  "quality_score": score,
 
124
  "notes": additional_notes,
125
  }
126
 
127
+ # 9) HF Datasets 保存(任意)
128
  dataset_repo = os.environ.get("DATASET_REPO")
129
  commit_info = None
130
  if dataset_repo:
 
131
  commit_info = persist_to_hf(
132
  dataset_repo=dataset_repo,
133
  record=result_json,
134
  anon_pdf_bytes=anon_pdf_bytes,
135
+ parquet_path=f"candidates/{cid}.parquet",
136
+ json_path=f"candidates/{cid}.json",
137
+ pdf_path=f"candidates/{cid}.anon.pdf",
138
  )
139
 
140
+ anon_pdf = (f"{cid}.anon.pdf", anon_pdf_bytes)
141
 
142
  return (
143
+ json.dumps(result_json, ensure_ascii=False, indent=2),
144
+ json.dumps(skills, ensure_ascii=False, indent=2), # out_skillsをCodeにしたのでJSON文字列返す
145
+ json.dumps(score, ensure_ascii=False, indent=2),
146
+ summaries["300chars"],
147
+ summaries["100chars"],
148
+ summaries["onesent"],
149
  anon_pdf,
150
  json.dumps(commit_info or {"status": "skipped (DATASET_REPO not set)"}, ensure_ascii=False, indent=2),
151
  )
 
159
  label="レジュメ類 (PDF/画像/Word/テキスト) 複数可",
160
  file_count="multiple",
161
  file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".docx", ".txt"],
162
+ type="filepath" # ← v4では 'filepath' or 'binary'
163
  )
164
  candidate_id = gr.Textbox(label="候補者ID(任意。未入力なら自動生成)")
165
  notes = gr.Textbox(label="補足メモ(任意)", lines=3)
166
 
167
+ run_btn = gr.Button("実行", variant="primary")
168
 
169
  with gr.Tab("構造化JSON"):
170
+ out_json = gr.Code(label="統合出力 (JSON)")
171
 
172
  with gr.Tab("抽出スキル"):
173
+ out_skills = gr.Code(label="スキル一覧 (JSON)") # JSON Schema問題回避
174
 
175
  with gr.Tab("品質スコア"):
176
+ out_score = gr.Code(label="品質評価 (JSON)")
177
 
178
  with gr.Tab("要約 (300/100/1文)"):
179
  out_sum_300 = gr.Textbox(label="300字要約")
 
184
  out_pdf = gr.File(label="匿名PDFダウンロード")
185
 
186
  with gr.Tab("Datasets 保存ログ"):
187
+ out_commit = gr.Code(label="コミット情報")
188
 
189
  run_btn.click(
190
  process_resumes,