Corin1998 committed on
Commit
fcf00ce
·
verified ·
1 Parent(s): ab638b2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -21
app.py CHANGED
@@ -28,23 +28,21 @@ def process_resumes(files, candidate_id: str, additional_notes: str = ""):
28
  raw_texts = []
29
 
30
  for f in files:
31
- # gr.Files(type="binary") では .read() / .name が利用可能
32
  raw_bytes = f.read()
33
  filetype = detect_filetype(f.name, raw_bytes)
34
 
35
- # 1) テキスト抽出:画像/PDFはOpenAI Vision OCR、docx/txtは生文面+OpenAI整形
36
  if filetype in {"pdf", "image"}:
37
  text = extract_text_with_openai(raw_bytes, filename=f.name, filetype=filetype)
38
  else:
39
  base_text = load_doc_text(filetype, raw_bytes)
40
- # 生テキストをそのままOpenAIへ渡し、軽く整形した全文を返す
41
  text = extract_text_with_openai(base_text.encode("utf-8"), filename=f.name, filetype="txt")
42
 
43
  raw_texts.append({"filename": f.name, "text": text})
44
 
45
- # 2) OpenAIでセクション構造化
46
  structured = structure_with_openai(text)
47
- # 念のためルールベース正規化も適用(期間抽出など補助)
48
  normalized = normalize_resume({
49
  "work_experience": structured.get("work_experience_raw", ""),
50
  "education": structured.get("education_raw", ""),
@@ -58,10 +56,10 @@ def process_resumes(files, candidate_id: str, additional_notes: str = ""):
58
  "normalized": normalized,
59
  })
60
 
61
- # 3) 統合(複数ファイル→1候補者)
62
  merged = merge_normalized_records([r["normalized"] for r in partial_records])
63
 
64
- # 4) スキル抽出(辞書/正規表現)
65
  merged_text = "\n\n".join([r["text"] for r in partial_records])
66
  skills = extract_skills(merged_text, {
67
  "work_experience": merged.get("raw_sections", {}).get("work_experience", ""),
@@ -70,17 +68,17 @@ def process_resumes(files, candidate_id: str, additional_notes: str = ""):
70
  "skills": ", ".join(merged.get("skills", [])),
71
  })
72
 
73
- # 5) 匿名化
74
  anonymized_text, anon_map = anonymize_text(merged_text)
75
  anon_pdf_bytes = render_anonymized_pdf(anonymized_text)
76
 
77
- # 6) 品質スコア
78
  score = compute_quality_score(merged_text, merged)
79
 
80
- # 7) 要約(300/100/1文)
81
  summaries = summarize_with_openai(merged_text)
82
 
83
- # 8) 構造化出力
84
  result_json = {
85
  "candidate_id": candidate_id or hashlib.sha256(merged_text.encode("utf-8")).hexdigest()[:16],
86
  "files": [f.name for f in files],
@@ -92,7 +90,7 @@ def process_resumes(files, candidate_id: str, additional_notes: str = ""):
92
  "notes": additional_notes,
93
  }
94
 
95
- # 9) HF Datasets 保存
96
  dataset_repo = os.environ.get("DATASET_REPO")
97
  commit_info = None
98
  if dataset_repo:
@@ -106,12 +104,15 @@ def process_resumes(files, candidate_id: str, additional_notes: str = ""):
106
  pdf_path=f"candidates/{file_hash}.anon.pdf",
107
  )
108
 
 
109
  anon_pdf = (result_json["candidate_id"] + ".anon.pdf", anon_pdf_bytes)
110
 
 
 
111
  return (
112
- json.dumps(result_json, ensure_ascii=False, indent=2),
113
- skills,
114
- json.dumps(score, ensure_ascii=False, indent=2),
115
  summaries["300chars"],
116
  summaries["100chars"],
117
  summaries["onesent"],
@@ -120,7 +121,7 @@ def process_resumes(files, candidate_id: str, additional_notes: str = ""):
120
  )
121
 
122
 
123
- with gr.Blocks(title=APP_TITLE) as demo:
124
  gr.Markdown(f"# {APP_TITLE}\n複数ファイルを統合→OpenAIで読み込み/構造化/要約→匿名化→Datasets保存")
125
 
126
  with gr.Row():
@@ -128,7 +129,7 @@ with gr.Blocks(title=APP_TITLE) as demo:
128
  label="レジュメ類 (PDF/画像/Word/テキスト) 複数可",
129
  file_count="multiple",
130
  file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".docx", ".txt"],
131
- type="binary", # Gradio 4.44系:'binary' or 'filepath'
132
  )
133
  candidate_id = gr.Textbox(label="候補者ID(任意。未入力なら自動生成)")
134
  notes = gr.Textbox(label="補足メモ(任意)", lines=3)
@@ -139,10 +140,10 @@ with gr.Blocks(title=APP_TITLE) as demo:
139
  out_json = gr.Code(label="統合出力 (JSON)")
140
 
141
  with gr.Tab("抽出スキル"):
142
- out_skills = gr.JSON(label="スキル一覧")
143
 
144
  with gr.Tab("品質スコア"):
145
- out_score = gr.Code(label="品質評価")
146
 
147
  with gr.Tab("要約 (300/100/1文)"):
148
  out_sum_300 = gr.Textbox(label="300字要約")
@@ -163,5 +164,10 @@ with gr.Blocks(title=APP_TITLE) as demo:
163
 
164
 
165
  if __name__ == "__main__":
166
- # Hugging Face Spaces では share=True は不要/非推奨
167
- demo.launch()
 
 
 
 
 
 
28
  raw_texts = []
29
 
30
  for f in files:
31
+ # gr.Files(type="binary"): .read() / .name が利用可能
32
  raw_bytes = f.read()
33
  filetype = detect_filetype(f.name, raw_bytes)
34
 
35
+ # 1) テキスト抽出
36
  if filetype in {"pdf", "image"}:
37
  text = extract_text_with_openai(raw_bytes, filename=f.name, filetype=filetype)
38
  else:
39
  base_text = load_doc_text(filetype, raw_bytes)
 
40
  text = extract_text_with_openai(base_text.encode("utf-8"), filename=f.name, filetype="txt")
41
 
42
  raw_texts.append({"filename": f.name, "text": text})
43
 
44
+ # 2) 構造化 -> 3) 正規化
45
  structured = structure_with_openai(text)
 
46
  normalized = normalize_resume({
47
  "work_experience": structured.get("work_experience_raw", ""),
48
  "education": structured.get("education_raw", ""),
 
56
  "normalized": normalized,
57
  })
58
 
59
+ # 4) 複数ファイル統合
60
  merged = merge_normalized_records([r["normalized"] for r in partial_records])
61
 
62
+ # 5) スキル抽出
63
  merged_text = "\n\n".join([r["text"] for r in partial_records])
64
  skills = extract_skills(merged_text, {
65
  "work_experience": merged.get("raw_sections", {}).get("work_experience", ""),
 
68
  "skills": ", ".join(merged.get("skills", [])),
69
  })
70
 
71
+ # 6) 匿名化
72
  anonymized_text, anon_map = anonymize_text(merged_text)
73
  anon_pdf_bytes = render_anonymized_pdf(anonymized_text)
74
 
75
+ # 7) 品質スコア
76
  score = compute_quality_score(merged_text, merged)
77
 
78
+ # 8) 要約
79
  summaries = summarize_with_openai(merged_text)
80
 
81
+ # 9) 構造化出力(最終JSON)
82
  result_json = {
83
  "candidate_id": candidate_id or hashlib.sha256(merged_text.encode("utf-8")).hexdigest()[:16],
84
  "files": [f.name for f in files],
 
90
  "notes": additional_notes,
91
  }
92
 
93
+ # 10) HF Datasets 保存(任意)
94
  dataset_repo = os.environ.get("DATASET_REPO")
95
  commit_info = None
96
  if dataset_repo:
 
104
  pdf_path=f"candidates/{file_hash}.anon.pdf",
105
  )
106
 
107
+ # gr.File には (filename, bytes) を返す
108
  anon_pdf = (result_json["candidate_id"] + ".anon.pdf", anon_pdf_bytes)
109
 
110
+ # ⚠️ gr.JSON は 4.44 で API スキーマ生成がコケる事があるため
111
+ # 画面表示用はすべて「文字列」にして gr.Code へ渡す
112
  return (
113
+ json.dumps(result_json, ensure_ascii=False, indent=2), # out_json -> Code
114
+ json.dumps(skills, ensure_ascii=False, indent=2), # out_skills -> Code
115
+ json.dumps(score, ensure_ascii=False, indent=2), # out_score -> Code
116
  summaries["300chars"],
117
  summaries["100chars"],
118
  summaries["onesent"],
 
121
  )
122
 
123
 
124
+ with gr.Blocks(title=APP_TITLE, analytics_enabled=False) as demo:
125
  gr.Markdown(f"# {APP_TITLE}\n複数ファイルを統合→OpenAIで読み込み/構造化/要約→匿名化→Datasets保存")
126
 
127
  with gr.Row():
 
129
  label="レジュメ類 (PDF/画像/Word/テキスト) 複数可",
130
  file_count="multiple",
131
  file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".docx", ".txt"],
132
+ type="binary", # 4.44系は 'binary' or 'filepath'
133
  )
134
  candidate_id = gr.Textbox(label="候補者ID(任意。未入力なら自動生成)")
135
  notes = gr.Textbox(label="補足メモ(任意)", lines=3)
 
140
  out_json = gr.Code(label="統合出力 (JSON)")
141
 
142
  with gr.Tab("抽出スキル"):
143
+ out_skills = gr.Code(label="スキル一覧(JSON表示)")
144
 
145
  with gr.Tab("品質スコア"):
146
+ out_score = gr.Code(label="品質評価(JSON表示)")
147
 
148
  with gr.Tab("要約 (300/100/1文)"):
149
  out_sum_300 = gr.Textbox(label="300字要約")
 
164
 
165
 
166
  if __name__ == "__main__":
167
+ # Spaces 等のPaaSで localhost アクセス不可な環境に合わせて明示
168
+ demo.launch(
169
+ server_name="0.0.0.0",
170
+ server_port=int(os.environ.get("PORT", "7860")),
171
+ share=True, # 必要環境での起動失敗を回避
172
+ show_error=True,
173
+ )