# NOTE(review): removed scraped Hugging Face Spaces page chrome that had been
# pasted into this file ("Spaces:"/"Runtime error" banners, file size, commit
# hashes, and gutter line numbers) — it was not Python and broke the module.
import hashlib
import io
import json
import os
import tempfile

import gradio as gr

from pipelines.anonymize import anonymize_text, render_anonymized_pdf
from pipelines.merge import merge_normalized_records
from pipelines.openai_ingest import (
    extract_text_with_openai,
    structure_with_openai,
    summarize_with_openai,
)
from pipelines.parsing import normalize_resume
from pipelines.scoring import compute_quality_score
from pipelines.skills import extract_skills
from pipelines.storage import persist_to_hf
from pipelines.utils import detect_filetype, load_doc_text
# Title shown in the browser tab (Blocks(title=...)) and the page header.
APP_TITLE = "候補者インテーク & レジュメ標準化(OpenAI版)"
def process_resumes(files, candidate_id: str, additional_notes: str = ""):
    """Ingest one or more resume files for a single candidate and return
    standardized artifacts.

    Args:
        files: List of local file paths handed over by
            ``gr.Files(type="filepath")``.
        candidate_id: Optional external candidate ID. When empty, a 16-char
            SHA-256 prefix of the merged text is generated instead.
        additional_notes: Free-form memo stored alongside the record.

    Returns:
        An 8-tuple matching the ``run_btn.click`` outputs: merged-record
        JSON, skills JSON, quality-score JSON, 300-char / 100-char /
        one-sentence summaries, the anonymized-PDF filepath, and the
        dataset-commit info JSON. Everything except the PDF path is a plain
        string so Gradio's API schema generation does not choke.

    Raises:
        gr.Error: When no files are supplied or a file cannot be read.
    """
    if not files:
        raise gr.Error("少なくとも1ファイルをアップロードしてください。")

    partial_records = []

    # gr.Files(type="filepath") passes a list of paths on local disk.
    for path in files:
        try:
            with open(path, "rb") as rf:
                raw_bytes = rf.read()
        except OSError as e:
            raise gr.Error(f"ファイル読み込みに失敗しました: {path}: {e}") from e

        fname = os.path.basename(path)
        filetype = detect_filetype(fname, raw_bytes)

        # 1) Text extraction: images/PDFs go through OpenAI Vision OCR;
        #    docx/txt are decoded locally, then lightly cleaned by OpenAI.
        if filetype in {"pdf", "image"}:
            text = extract_text_with_openai(raw_bytes, filename=fname, filetype=filetype)
        else:
            base_text = load_doc_text(filetype, raw_bytes)
            # Feed the raw text to OpenAI as-is; it returns a lightly
            # reformatted full text.
            text = extract_text_with_openai(base_text.encode("utf-8"), filename=fname, filetype="txt")

        # 2) Section structuring via OpenAI, then rule-based normalization.
        structured = structure_with_openai(text)
        normalized = normalize_resume({
            "work_experience": structured.get("work_experience_raw", ""),
            "education": structured.get("education_raw", ""),
            "certifications": structured.get("certifications_raw", ""),
            "skills": ", ".join(structured.get("skills_list", [])),
        })
        partial_records.append({
            "source": fname,
            "text": text,
            "structured": structured,
            "normalized": normalized,
        })

    # 3) Merge: multiple files collapse into one candidate record.
    merged = merge_normalized_records([r["normalized"] for r in partial_records])

    # 4) Skill extraction (dictionary / regex based) over the merged text.
    merged_text = "\n\n".join(r["text"] for r in partial_records)
    raw_sections = merged.get("raw_sections", {})
    skills = extract_skills(merged_text, {
        "work_experience": raw_sections.get("work_experience", ""),
        "education": raw_sections.get("education", ""),
        "certifications": raw_sections.get("certifications", ""),
        "skills": ", ".join(merged.get("skills", [])),
    })

    # 5) Anonymization: redacted text plus a rendered PDF of it.
    anonymized_text, anon_map = anonymize_text(merged_text)
    anon_pdf_bytes = render_anonymized_pdf(anonymized_text)

    # 6) Quality score.
    score = compute_quality_score(merged_text, merged)

    # 7) Summaries (300 chars / 100 chars / one sentence).
    summaries = summarize_with_openai(merged_text)

    # 8) Structured output (serialized to a JSON string for the UI).
    result_json = {
        "candidate_id": candidate_id or hashlib.sha256(merged_text.encode("utf-8")).hexdigest()[:16],
        "files": [os.path.basename(p) for p in files],
        "merged": merged,
        "skills": skills,
        "quality_score": score,
        "summaries": summaries,
        "anonymization_map": anon_map,
        "notes": additional_notes,
    }

    # 9) Optional persistence to a Hugging Face dataset repo.
    dataset_repo = os.environ.get("DATASET_REPO")
    commit_info = None
    if dataset_repo:
        file_hash = result_json["candidate_id"]
        commit_info = persist_to_hf(
            dataset_repo=dataset_repo,
            record=result_json,
            anon_pdf_bytes=anon_pdf_bytes,
            parquet_path=f"candidates/{file_hash}.parquet",
            json_path=f"candidates/{file_hash}.json",
            pdf_path=f"candidates/{file_hash}.anon.pdf",
        )

    # gr.File expects a filepath as the output value; a (filename, bytes)
    # tuple is not a valid File payload in current Gradio and raises when
    # the response is serialized. Write the PDF to a temp dir and return
    # its path instead.
    pdf_dir = tempfile.mkdtemp(prefix="anon_pdf_")
    anon_pdf_path = os.path.join(pdf_dir, result_json["candidate_id"] + ".anon.pdf")
    with open(anon_pdf_path, "wb") as wf:
        wf.write(anon_pdf_bytes)

    # All remaining return values are strings (see docstring) to avoid
    # Gradio API-schema generation errors.
    return (
        json.dumps(result_json, ensure_ascii=False, indent=2),
        json.dumps(skills, ensure_ascii=False, indent=2),
        json.dumps(score, ensure_ascii=False, indent=2),
        summaries.get("300chars", ""),
        summaries.get("100chars", ""),
        summaries.get("onesent", ""),
        anon_pdf_path,
        json.dumps(commit_info or {"status": "skipped (DATASET_REPO not set)"}, ensure_ascii=False, indent=2),
    )
# Gradio UI definition. Runs at import time so platforms that import this
# module (e.g. HF Spaces) can find `demo`.
with gr.Blocks(title=APP_TITLE) as demo:
    gr.Markdown(f"# {APP_TITLE}\n複数ファイルを統合→OpenAIで読み込み/構造化/要約→匿名化→Datasets保存")
    with gr.Row():
        in_files = gr.Files(
            label="レジュメ類 (PDF/画像/Word/テキスト) 複数可",
            file_count="multiple",
            file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".docx", ".txt"],
            type="filepath",  # IMPORTANT: 'file' is not a valid value; must be 'filepath' or 'binary'
        )
        candidate_id = gr.Textbox(label="候補者ID(任意。未入力なら自動生成)")
        notes = gr.Textbox(label="補足メモ(任意)", lines=3)
    run_btn = gr.Button("実行")
    with gr.Tab("構造化JSON"):
        out_json = gr.Code(label="統合出力 (JSON)")
    with gr.Tab("抽出スキル"):
        # gr.JSON can raise during API-schema generation in some versions,
        # so the skills are displayed as a JSON string in a Code component.
        out_skills = gr.Code(label="スキル一覧(JSON表示)")
    with gr.Tab("品質スコア"):
        out_score = gr.Code(label="品質評価(JSON)")
    with gr.Tab("要約 (300/100/1文)"):
        out_sum_300 = gr.Textbox(label="300字要約")
        out_sum_100 = gr.Textbox(label="100字要約")
        out_sum_1 = gr.Textbox(label="1文要約")
    with gr.Tab("匿名PDF"):
        out_pdf = gr.File(label="匿名PDFダウンロード")
    with gr.Tab("Datasets 保存ログ"):
        out_commit = gr.Code(label="コミット情報")
    # NOTE: `outputs` order must match process_resumes' return tuple exactly.
    run_btn.click(
        process_resumes,
        inputs=[in_files, candidate_id, notes],
        outputs=[out_json, out_skills, out_score, out_sum_300, out_sum_100, out_sum_1, out_pdf, out_commit],
    )
if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable from outside the
    # container (e.g. on Hugging Face Spaces), honoring the platform's PORT.
    # NOTE(review): share=True was removed — Spaces does not support Gradio
    # share tunnels (launch() warns and ignores it there), and outside
    # Spaces it tries to start an frpc tunnel that can fail at startup.
    # The platform already serves the app publicly.
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.environ.get("PORT", "7860")),
        show_error=True,
        analytics_enabled=False,
    )