Corin1998 committed on
Commit
d3037a1
·
verified ·
1 Parent(s): 76f9ba6

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +161 -0
app.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import json
4
+ import hashlib
5
+ import gradio as gr
6
+
7
+ from pipelines.openai_ingest import (
8
+ extract_text_with_openai,
9
+ structure_with_openai,
10
+ summarize_with_openai,
11
+ )
12
+ from pipelines.parsing import normalize_resume
13
+ from pipelines.merge import merge_normalized_records
14
+ from pipelines.skills import extract_skills
15
+ from pipelines.anonymize import anonymize_text, render_anonymized_pdf
16
+ from pipelines.scoring import compute_quality_score
17
+ from pipelines.storage import persist_to_hf
18
+ from pipelines.utils import detect_filetype, load_doc_text
19
+
# Page title shown in the Gradio header (Japanese: "Candidate intake &
# resume standardization (OpenAI edition)").
APP_TITLE = "候補者インテーク & レジュメ標準化(OpenAI版)"
21
+
22
+
def process_resumes(files, candidate_id: str, additional_notes: str = ""):
    """Run the full intake pipeline over the uploaded resume files.

    Steps: per-file text extraction (OpenAI Vision OCR for pdf/image, local
    load + OpenAI cleanup otherwise), OpenAI section structuring, rule-based
    normalization, multi-file merge, skill extraction, anonymization, quality
    scoring, summarization, and optional persistence to a HF dataset repo.

    Args:
        files: uploads from ``gr.Files``; each item is either a tempfile
            wrapper (``type="file"``) or a plain path string
            (``type="filepath"``) — both are supported.
        candidate_id: optional ID; falls back to a content hash when empty.
        additional_notes: free-form memo stored alongside the record.

    Returns:
        Tuple matching the Gradio outputs: (result JSON, skills dict,
        score JSON, 300-char summary, 100-char summary, one-sentence summary,
        path to the anonymized PDF, commit-info JSON).

    Raises:
        gr.Error: if no files were uploaded.
    """
    import tempfile  # local import: only needed for the anonymized-PDF handoff

    if not files:
        raise gr.Error("少なくとも1ファイルをアップロードしてください。")

    partial_records = []
    raw_texts = []  # collected for parity with the pipeline; not returned

    for f in files:
        # Gradio may hand us a closed tempfile wrapper (type="file") or a bare
        # path string (type="filepath"). Reopening by path works for both,
        # whereas f.read() on the wrapper raises "I/O operation on closed file".
        path = f if isinstance(f, str) else f.name
        with open(path, "rb") as fh:
            raw_bytes = fh.read()
        filetype = detect_filetype(path, raw_bytes)

        # 1) Text extraction: images/PDFs go through OpenAI Vision OCR;
        #    docx/txt are loaded locally, then lightly reformatted by OpenAI.
        if filetype in {"pdf", "image"}:
            text = extract_text_with_openai(raw_bytes, filename=path, filetype=filetype)
        else:
            base_text = load_doc_text(filetype, raw_bytes)
            # Pass the raw text straight to OpenAI; it returns a cleaned-up
            # version of the full document.
            text = extract_text_with_openai(base_text.encode("utf-8"), filename=path, filetype="txt")

        raw_texts.append({"filename": path, "text": text})

        # 2) Section structuring via OpenAI, plus rule-based normalization as
        #    a safety net (period extraction and similar helpers).
        structured = structure_with_openai(text)
        normalized = normalize_resume({
            "work_experience": structured.get("work_experience_raw", ""),
            "education": structured.get("education_raw", ""),
            "certifications": structured.get("certifications_raw", ""),
            "skills": ", ".join(structured.get("skills_list", [])),
        })
        partial_records.append({
            "source": path,
            "text": text,
            "structured": structured,
            "normalized": normalized,
        })

    # 3) Merge multiple files into a single candidate record.
    merged = merge_normalized_records([r["normalized"] for r in partial_records])

    # 4) Dictionary/regex-based skill extraction over the concatenated text.
    merged_text = "\n\n".join(r["text"] for r in partial_records)
    skills = extract_skills(merged_text, {
        "work_experience": merged.get("raw_sections", {}).get("work_experience", ""),
        "education": merged.get("raw_sections", {}).get("education", ""),
        "certifications": merged.get("raw_sections", {}).get("certifications", ""),
        "skills": ", ".join(merged.get("skills", [])),
    })

    # 5) Anonymization (redacted text + rendered PDF).
    anonymized_text, anon_map = anonymize_text(merged_text)
    anon_pdf_bytes = render_anonymized_pdf(anonymized_text)

    # 6) Quality score.
    score = compute_quality_score(merged_text, merged)

    # 7) Summaries (300 chars / 100 chars / one sentence).
    summaries = summarize_with_openai(merged_text)

    # 8) Structured output. candidate_id falls back to a content hash so the
    #    same merged text always yields the same ID.
    result_json = {
        "candidate_id": candidate_id or hashlib.sha256(merged_text.encode("utf-8")).hexdigest()[:16],
        "files": [r["source"] for r in partial_records],
        "merged": merged,
        "skills": skills,
        "quality_score": score,
        "summaries": summaries,
        "anonymization_map": anon_map,
        "notes": additional_notes,
    }

    # 9) Optional persistence to HF Datasets (skipped unless DATASET_REPO set).
    dataset_repo = os.environ.get("DATASET_REPO")
    commit_info = None
    if dataset_repo:
        file_hash = result_json["candidate_id"]
        commit_info = persist_to_hf(
            dataset_repo=dataset_repo,
            record=result_json,
            anon_pdf_bytes=anon_pdf_bytes,
            parquet_path=f"candidates/{file_hash}.parquet",
            json_path=f"candidates/{file_hash}.json",
            pdf_path=f"candidates/{file_hash}.anon.pdf",
        )

    # gr.File output components expect a file path, not a (name, bytes) tuple;
    # write the anonymized PDF to a temp file and hand back its path.
    anon_dir = tempfile.mkdtemp(prefix="anon_pdf_")
    anon_pdf_path = os.path.join(anon_dir, result_json["candidate_id"] + ".anon.pdf")
    with open(anon_pdf_path, "wb") as out:
        out.write(anon_pdf_bytes)

    return (
        json.dumps(result_json, ensure_ascii=False, indent=2),
        skills,
        json.dumps(score, ensure_ascii=False, indent=2),
        summaries["300chars"],
        summaries["100chars"],
        summaries["onesent"],
        anon_pdf_path,
        json.dumps(commit_info or {"status": "skipped (DATASET_REPO not set)"}, ensure_ascii=False, indent=2),
    )
120
+
121
+
# --- Gradio UI wiring -------------------------------------------------------
with gr.Blocks(title=APP_TITLE) as demo:
    gr.Markdown(f"# {APP_TITLE}\n複数ファイルを統合→OpenAIで読み込み/構造化/要約→匿名化→Datasets保存")

    with gr.Row():
        # NOTE(review): type="file" is Gradio 3.x syntax; Gradio 4+ expects
        # type="filepath" / "binary" — confirm the pinned gradio version.
        in_files = gr.Files(label="レジュメ類 (PDF/画像/Word/テキスト) 複数可", file_count="multiple",
                            file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".docx", ".txt"], type="file")
        # Fixed mojibake in the label ("候��者ID" -> "候補者ID").
        candidate_id = gr.Textbox(label="候補者ID(任意。未入力なら自動生成)")
        notes = gr.Textbox(label="補足メモ(任意)", lines=3)

    run_btn = gr.Button("実行")

    with gr.Tab("構造化JSON"):
        out_json = gr.Code(label="統合出力 (JSON)")

    with gr.Tab("抽出スキル"):
        out_skills = gr.JSON(label="スキル一覧")

    with gr.Tab("品質スコア"):
        out_score = gr.Code(label="品質評価")

    with gr.Tab("要約 (300/100/1文)"):
        out_sum_300 = gr.Textbox(label="300字要約")
        out_sum_100 = gr.Textbox(label="100字要約")
        out_sum_1 = gr.Textbox(label="1文要約")

    with gr.Tab("匿名PDF"):
        out_pdf = gr.File(label="匿名PDFダウンロード")

    with gr.Tab("Datasets 保存ログ"):
        out_commit = gr.Code(label="コミット情報")

    # One click runs the whole pipeline; output order must match the tuple
    # returned by process_resumes.
    run_btn.click(
        process_resumes,
        inputs=[in_files, candidate_id, notes],
        outputs=[out_json, out_skills, out_score, out_sum_300, out_sum_100, out_sum_1, out_pdf, out_commit],
    )
158
+
159
+
# Launch the Gradio app only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()