File size: 7,206 Bytes
ab638b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d3037a1
 
a82717b
87d8093
 
 
 
a82717b
d3037a1
 
 
 
 
87d8093
 
 
 
 
 
 
894c91e
87d8093
 
 
 
d3037a1
87d8093
865007c
d3037a1
87d8093
 
d3037a1
87d8093
d3037a1
87d8093
d3037a1
 
 
 
 
 
 
 
87d8093
d3037a1
 
 
 
 
87d8093
ab638b2
 
87d8093
ab638b2
 
 
 
 
 
 
d3037a1
87d8093
ab638b2
 
 
87d8093
ab638b2
 
87d8093
ab638b2
 
87d8093
d3037a1
 
87d8093
d3037a1
 
 
 
 
 
 
 
87d8093
ab638b2
 
 
 
 
 
 
 
 
 
 
 
 
87d8093
ab638b2
d3037a1
87d8093
d3037a1
87d8093
 
 
 
 
 
d3037a1
 
 
 
 
87d8093
ab638b2
 
 
 
 
 
 
87d8093
ab638b2
 
 
 
 
 
 
 
 
 
87d8093
fcf00ce
ab638b2
 
87d8093
ab638b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d3037a1
87d8093
fcf00ce
 
 
87d8093
fcf00ce
87d8093
fcf00ce
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import hashlib
import io
import json
import os
import tempfile

import gradio as gr

from pipelines.openai_ingest import (
    extract_text_with_openai,
    structure_with_openai,
    summarize_with_openai,
)
from pipelines.parsing import normalize_resume
from pipelines.merge import merge_normalized_records
from pipelines.skills import extract_skills
from pipelines.anonymize import anonymize_text, render_anonymized_pdf
from pipelines.scoring import compute_quality_score
from pipelines.storage import persist_to_hf
from pipelines.utils import detect_filetype, load_doc_text

# Display title of the Gradio app; used both as the browser/page title
# (gr.Blocks(title=...)) and as the top Markdown heading.
APP_TITLE = "候補者インテーク & レジュメ標準化(OpenAI版)"


def _ingest_file(path: str) -> dict:
    """Read one uploaded file and turn it into a partial candidate record.

    Extraction route: PDFs/images go through OpenAI Vision OCR; docx/txt are
    loaded locally and then lightly cleaned up by OpenAI.

    Args:
        path: Filesystem path of one uploaded file.

    Returns:
        Dict with keys ``source`` (basename), ``text`` (extracted full text),
        ``structured`` (OpenAI section structuring) and ``normalized``
        (rule-based normalization of the structured sections).

    Raises:
        gr.Error: when the file cannot be read from disk.
    """
    try:
        with open(path, "rb") as rf:
            raw_bytes = rf.read()
    except OSError as e:
        raise gr.Error(f"ファイル読み込みに失敗しました: {path}: {e}")

    fname = os.path.basename(path)
    filetype = detect_filetype(fname, raw_bytes)

    # 1) Text extraction: image/PDF -> OpenAI Vision OCR; docx/txt -> local
    #    load, then pass the raw text to OpenAI for light cleanup.
    if filetype in {"pdf", "image"}:
        text = extract_text_with_openai(raw_bytes, filename=fname, filetype=filetype)
    else:
        base_text = load_doc_text(filetype, raw_bytes)
        text = extract_text_with_openai(base_text.encode("utf-8"), filename=fname, filetype="txt")

    # 2) OpenAI section structuring -> rule-based normalization
    structured = structure_with_openai(text)
    normalized = normalize_resume({
        "work_experience": structured.get("work_experience_raw", ""),
        "education": structured.get("education_raw", ""),
        "certifications": structured.get("certifications_raw", ""),
        "skills": ", ".join(structured.get("skills_list", [])),
    })
    return {
        "source": fname,
        "text": text,
        "structured": structured,
        "normalized": normalized,
    }


def process_resumes(files, candidate_id: str, additional_notes: str = ""):
    """Run the full intake pipeline for one candidate's resume files.

    Args:
        files: List of file paths as delivered by ``gr.Files(type="filepath")``.
        candidate_id: Optional stable ID; when empty, one is derived from the
            SHA-256 of the merged text.
        additional_notes: Free-form notes stored alongside the record.

    Returns:
        A tuple of strings (JSON dumps and the three summaries) plus one file
        path (the anonymized PDF). Everything is a string or a file on purpose,
        to avoid Gradio API-schema generation errors.

    Raises:
        gr.Error: when no files were uploaded or a file cannot be read.
    """
    if not files:
        raise gr.Error("少なくとも1ファイルをアップロードしてください。")

    # Per-file ingestion: read -> OCR/extract -> structure -> normalize.
    partial_records = [_ingest_file(path) for path in files]

    # 3) Merge (multiple files -> one candidate)
    merged = merge_normalized_records([r["normalized"] for r in partial_records])

    # 4) Skill extraction (dictionary / regex based)
    merged_text = "\n\n".join(r["text"] for r in partial_records)
    skills = extract_skills(merged_text, {
        "work_experience": merged.get("raw_sections", {}).get("work_experience", ""),
        "education": merged.get("raw_sections", {}).get("education", ""),
        "certifications": merged.get("raw_sections", {}).get("certifications", ""),
        "skills": ", ".join(merged.get("skills", [])),
    })

    # 5) Anonymization (text + rendered PDF bytes)
    anonymized_text, anon_map = anonymize_text(merged_text)
    anon_pdf_bytes = render_anonymized_pdf(anonymized_text)

    # 6) Quality score
    score = compute_quality_score(merged_text, merged)

    # 7) Summaries (300 chars / 100 chars / one sentence)
    summaries = summarize_with_openai(merged_text)

    # 8) Structured record (serialized to strings before returning)
    result_json = {
        "candidate_id": candidate_id or hashlib.sha256(merged_text.encode("utf-8")).hexdigest()[:16],
        "files": [os.path.basename(p) for p in files],
        "merged": merged,
        "skills": skills,
        "quality_score": score,
        "summaries": summaries,
        "anonymization_map": anon_map,
        "notes": additional_notes,
    }

    # 9) Optional persistence to HF Datasets (skipped unless DATASET_REPO is set)
    dataset_repo = os.environ.get("DATASET_REPO")
    commit_info = None
    if dataset_repo:
        file_hash = result_json["candidate_id"]
        commit_info = persist_to_hf(
            dataset_repo=dataset_repo,
            record=result_json,
            anon_pdf_bytes=anon_pdf_bytes,
            parquet_path=f"candidates/{file_hash}.parquet",
            json_path=f"candidates/{file_hash}.json",
            pdf_path=f"candidates/{file_hash}.anon.pdf",
        )

    # BUG FIX: gr.File expects a file *path* as output, not a
    # (filename, bytes) tuple — write the PDF to a temp file and return
    # its path so the download component works.
    anon_pdf_path = os.path.join(
        tempfile.mkdtemp(prefix="anon_pdf_"),
        result_json["candidate_id"] + ".anon.pdf",
    )
    with open(anon_pdf_path, "wb") as wf:
        wf.write(anon_pdf_bytes)

    # All return values are strings (plus one file path) — see docstring.
    return (
        json.dumps(result_json, ensure_ascii=False, indent=2),
        json.dumps(skills, ensure_ascii=False, indent=2),
        json.dumps(score, ensure_ascii=False, indent=2),
        summaries.get("300chars", ""),
        summaries.get("100chars", ""),
        summaries.get("onesent", ""),
        anon_pdf_path,
        json.dumps(commit_info or {"status": "skipped (DATASET_REPO not set)"}, ensure_ascii=False, indent=2),
    )


# --- Gradio UI definition (top-level Blocks app, bound to `demo`) ---
with gr.Blocks(title=APP_TITLE) as demo:
    gr.Markdown(f"# {APP_TITLE}\n複数ファイルを統合→OpenAIで読み込み/構造化/要約→匿名化→Datasets保存")

    with gr.Row():
        in_files = gr.Files(
            label="レジュメ類 (PDF/画像/Word/テキスト) 複数可",
            file_count="multiple",
            file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".docx", ".txt"],
            type="filepath",  # IMPORTANT: 'file' is not a valid value; only 'filepath' or 'binary'
        )
        candidate_id = gr.Textbox(label="候補者ID(任意。未入力なら自動生成)")
    notes = gr.Textbox(label="補足メモ(任意)", lines=3)

    run_btn = gr.Button("実行")

    with gr.Tab("構造化JSON"):
        out_json = gr.Code(label="統合出力 (JSON)")

    with gr.Tab("抽出スキル"):
        # gr.JSON can raise during API schema generation in some cases, so we
        # avoid it and display the JSON as a string instead.
        out_skills = gr.Code(label="スキル一覧(JSON表示)")

    with gr.Tab("品質スコア"):
        out_score = gr.Code(label="品質評価(JSON)")

    with gr.Tab("要約 (300/100/1文)"):
        out_sum_300 = gr.Textbox(label="300字要約")
        out_sum_100 = gr.Textbox(label="100字要約")
        out_sum_1 = gr.Textbox(label="1文要約")

    with gr.Tab("匿名PDF"):
        out_pdf = gr.File(label="匿名PDFダウンロード")

    with gr.Tab("Datasets 保存ログ"):
        out_commit = gr.Code(label="コミット情報")

    # Wire the button to the pipeline; outputs must match the order of the
    # tuple returned by process_resumes.
    run_btn.click(
        process_resumes,
        inputs=[in_files, candidate_id, notes],
        outputs=[out_json, out_skills, out_score, out_sum_300, out_sum_100, out_sum_1, out_pdf, out_commit],
    )


if __name__ == "__main__":
    # Bind to all interfaces for hosts (e.g. HF Spaces) where localhost is
    # not exposed; the PORT env var overrides the default 7860.
    port = int(os.environ.get("PORT", "7860"))
    demo.launch(
        server_name="0.0.0.0",
        server_port=port,
        share=True,
        show_error=True,
        analytics_enabled=False,
    )