Corin1998 committed on
Commit
e8bbd9b
·
verified ·
1 Parent(s): aa423bd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -55
app.py CHANGED
@@ -2,6 +2,7 @@ import os
2
  import io
3
  import json
4
  import hashlib
 
5
  import gradio as gr
6
 
7
  from pipelines.openai_ingest import (
@@ -19,40 +20,18 @@ from pipelines.utils import detect_filetype, load_doc_text
19
 
20
  APP_TITLE = "候補者インテーク & レジュメ標準化(OpenAI版)"
21
 
22
- # Gradio v4 Filesの入力を安全にハンドリング
23
- def _iter_files(files):
24
- """
25
- Gradio v4:
26
- - type="filepath": files文字列パスのリスト
27
- - type="binary" : files は {name: str, data: bytes} の辞書リスト
28
- 互換目的で file-like も許容。
29
  """
30
- for f in files:
31
- # filepath
32
- if isinstance(f, str):
33
- path = f
34
- name = os.path.basename(path)
35
- with open(path, "rb") as fh:
36
- data = fh.read()
37
- yield name, data
38
- continue
39
- # binary
40
- if isinstance(f, dict) and "name" in f and "data" in f:
41
- name = os.path.basename(f["name"] or "uploaded")
42
- data = f["data"]
43
- yield name, data
44
- continue
45
- # file-like
46
- if hasattr(f, "read"):
47
- try:
48
- name = os.path.basename(getattr(f, "name", "uploaded"))
49
- except Exception:
50
- name = "uploaded"
51
- data = f.read()
52
- yield name, data
53
- continue
54
- # 不明形式はスキップ
55
- continue
56
 
57
 
58
  def process_resumes(files, candidate_id: str, additional_notes: str = ""):
@@ -62,20 +41,30 @@ def process_resumes(files, candidate_id: str, additional_notes: str = ""):
62
  partial_records = []
63
  raw_texts = []
64
 
65
- for fname, raw_bytes in _iter_files(files):
 
66
  filetype = detect_filetype(fname, raw_bytes)
67
 
68
  # 1) テキスト抽出:画像/PDFはOpenAI Vision OCR、docx/txtは生文面+OpenAI整形
69
  if filetype in {"pdf", "image"}:
70
  text = extract_text_with_openai(raw_bytes, filename=fname, filetype=filetype)
71
- else:
72
  base_text = load_doc_text(filetype, raw_bytes)
 
 
 
 
 
 
 
 
73
  text = extract_text_with_openai(base_text.encode("utf-8"), filename=fname, filetype="txt")
74
 
75
  raw_texts.append({"filename": fname, "text": text})
76
 
77
- # 2) OpenAIでセクション構造化 → ルール正規化
78
  structured = structure_with_openai(text)
 
79
  normalized = normalize_resume({
80
  "work_experience": structured.get("work_experience_raw", ""),
81
  "education": structured.get("education_raw", ""),
@@ -92,7 +81,7 @@ def process_resumes(files, candidate_id: str, additional_notes: str = ""):
92
  # 3) 統合(複数ファイル→1候補者)
93
  merged = merge_normalized_records([r["normalized"] for r in partial_records])
94
 
95
- # 4) スキル抽出
96
  merged_text = "\n\n".join([r["text"] for r in partial_records])
97
  skills = extract_skills(merged_text, {
98
  "work_experience": merged.get("raw_sections", {}).get("work_experience", ""),
@@ -101,20 +90,20 @@ def process_resumes(files, candidate_id: str, additional_notes: str = ""):
101
  "skills": ", ".join(merged.get("skills", [])),
102
  })
103
 
104
- # 5) 匿名化 → 匿名PDF生成
105
  anonymized_text, anon_map = anonymize_text(merged_text)
106
  anon_pdf_bytes = render_anonymized_pdf(anonymized_text)
107
 
108
  # 6) 品質スコア
109
  score = compute_quality_score(merged_text, merged)
110
 
111
- # 7) 要約
112
  summaries = summarize_with_openai(merged_text)
113
 
114
  # 8) 構造化出力
115
- cid = candidate_id or hashlib.sha256(merged_text.encode("utf-8")).hexdigest()[:16]
116
  result_json = {
117
- "candidate_id": cid,
118
  "files": [r["source"] for r in partial_records],
119
  "merged": merged,
120
  "skills": skills,
@@ -124,28 +113,30 @@ def process_resumes(files, candidate_id: str, additional_notes: str = ""):
124
  "notes": additional_notes,
125
  }
126
 
127
- # 9) HF Datasets 保存(任意)
128
  dataset_repo = os.environ.get("DATASET_REPO")
129
  commit_info = None
130
  if dataset_repo:
 
131
  commit_info = persist_to_hf(
132
  dataset_repo=dataset_repo,
133
  record=result_json,
134
  anon_pdf_bytes=anon_pdf_bytes,
135
- parquet_path=f"candidates/{cid}.parquet",
136
- json_path=f"candidates/{cid}.json",
137
- pdf_path=f"candidates/{cid}.anon.pdf",
138
  )
139
 
140
- anon_pdf = (f"{cid}.anon.pdf", anon_pdf_bytes)
141
 
 
142
  return (
143
  json.dumps(result_json, ensure_ascii=False, indent=2),
144
- json.dumps(skills, ensure_ascii=False, indent=2), # ← out_skillsをCodeにしたのでJSON文字列を返す
145
  json.dumps(score, ensure_ascii=False, indent=2),
146
- summaries["300chars"],
147
- summaries["100chars"],
148
- summaries["onesent"],
149
  anon_pdf,
150
  json.dumps(commit_info or {"status": "skipped (DATASET_REPO not set)"}, ensure_ascii=False, indent=2),
151
  )
@@ -159,21 +150,22 @@ with gr.Blocks(title=APP_TITLE) as demo:
159
  label="レジュメ類 (PDF/画像/Word/テキスト) 複数可",
160
  file_count="multiple",
161
  file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".docx", ".txt"],
162
- type="filepath" # ここを修正
163
  )
164
  candidate_id = gr.Textbox(label="候補者ID(任意。未入力なら自動生成)")
165
  notes = gr.Textbox(label="補足メモ(任意)", lines=3)
166
 
167
- run_btn = gr.Button("実行", variant="primary")
168
 
169
  with gr.Tab("構造化JSON"):
170
  out_json = gr.Code(label="統合出力 (JSON)")
171
 
172
  with gr.Tab("抽出スキル"):
173
- out_skills = gr.Code(label="スキル一覧 (JSON)") # JSON Schema問題回避
 
174
 
175
  with gr.Tab("品質スコア"):
176
- out_score = gr.Code(label="品質評価 (JSON)")
177
 
178
  with gr.Tab("要約 (300/100/1文)"):
179
  out_sum_300 = gr.Textbox(label="300字要約")
@@ -194,4 +186,9 @@ with gr.Blocks(title=APP_TITLE) as demo:
194
 
195
 
196
  if __name__ == "__main__":
197
- demo.launch()
 
 
 
 
 
 
2
  import io
3
  import json
4
  import hashlib
5
+ import pathlib
6
  import gradio as gr
7
 
8
  from pipelines.openai_ingest import (
 
20
 
21
  APP_TITLE = "候補者インテーク & レジュメ標準化(OpenAI版)"
22
 
23
+
24
+ def _read_file_obj_or_path(f):
25
+ """gr.Files v4 用の堅牢リーダ。
26
+ - type="filepath" の場合: f は str/Path
27
+ - type="binary" の場合: f UploadedFile ライク(.name/.read())
28
+ 戻り値: (filename, bytes)
 
29
  """
30
+ if isinstance(f, (str, pathlib.Path)):
31
+ p = pathlib.Path(f)
32
+ return p.name, p.read_bytes()
33
+ # UploadedFile 互換
34
+ return getattr(f, "name", "uploaded"), f.read()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
 
37
  def process_resumes(files, candidate_id: str, additional_notes: str = ""):
 
41
  partial_records = []
42
  raw_texts = []
43
 
44
+ for f in files:
45
+ fname, raw_bytes = _read_file_obj_or_path(f)
46
  filetype = detect_filetype(fname, raw_bytes)
47
 
48
  # 1) テキスト抽出:画像/PDFはOpenAI Vision OCR、docx/txtは生文面+OpenAI整形
49
  if filetype in {"pdf", "image"}:
50
  text = extract_text_with_openai(raw_bytes, filename=fname, filetype=filetype)
51
+ elif filetype in {"docx", "txt"}:
52
  base_text = load_doc_text(filetype, raw_bytes)
53
+ # 生テキストをOpenAIへ渡し、整形した全文を返す
54
+ text = extract_text_with_openai(base_text.encode("utf-8"), filename=fname, filetype="txt")
55
+ else:
56
+ # フォールバック:そのままテキスト化を試み、ダメならエラーを添えて続行
57
+ try:
58
+ base_text = raw_bytes.decode("utf-8", errors="ignore")
59
+ except Exception:
60
+ base_text = ""
61
  text = extract_text_with_openai(base_text.encode("utf-8"), filename=fname, filetype="txt")
62
 
63
  raw_texts.append({"filename": fname, "text": text})
64
 
65
+ # 2) OpenAIでセクション構造化
66
  structured = structure_with_openai(text)
67
+ # 念のためルールベース正規化も適用(期間抽出など補助)
68
  normalized = normalize_resume({
69
  "work_experience": structured.get("work_experience_raw", ""),
70
  "education": structured.get("education_raw", ""),
 
81
  # 3) 統合(複数ファイル→1候補者)
82
  merged = merge_normalized_records([r["normalized"] for r in partial_records])
83
 
84
+ # 4) スキル抽出(辞書/正規表現)
85
  merged_text = "\n\n".join([r["text"] for r in partial_records])
86
  skills = extract_skills(merged_text, {
87
  "work_experience": merged.get("raw_sections", {}).get("work_experience", ""),
 
90
  "skills": ", ".join(merged.get("skills", [])),
91
  })
92
 
93
+ # 5) 匿名化
94
  anonymized_text, anon_map = anonymize_text(merged_text)
95
  anon_pdf_bytes = render_anonymized_pdf(anonymized_text)
96
 
97
  # 6) 品質スコア
98
  score = compute_quality_score(merged_text, merged)
99
 
100
+ # 7) 要約(300/100/1文)
101
  summaries = summarize_with_openai(merged_text)
102
 
103
  # 8) 構造化出力
104
+ candidate_id_final = candidate_id or hashlib.sha256(merged_text.encode("utf-8")).hexdigest()[:16]
105
  result_json = {
106
+ "candidate_id": candidate_id_final,
107
  "files": [r["source"] for r in partial_records],
108
  "merged": merged,
109
  "skills": skills,
 
113
  "notes": additional_notes,
114
  }
115
 
116
+ # 9) HF Datasets 保存
117
  dataset_repo = os.environ.get("DATASET_REPO")
118
  commit_info = None
119
  if dataset_repo:
120
+ file_hash = result_json["candidate_id"]
121
  commit_info = persist_to_hf(
122
  dataset_repo=dataset_repo,
123
  record=result_json,
124
  anon_pdf_bytes=anon_pdf_bytes,
125
+ parquet_path=f"candidates/{file_hash}.parquet",
126
+ json_path=f"candidates/{file_hash}.json",
127
+ pdf_path=f"candidates/{file_hash}.anon.pdf",
128
  )
129
 
130
+ anon_pdf = (candidate_id_final + ".anon.pdf", anon_pdf_bytes)
131
 
132
+ # 重要: gr.JSON を避けるため JSON文字列で返す
133
  return (
134
  json.dumps(result_json, ensure_ascii=False, indent=2),
135
+ json.dumps(skills, ensure_ascii=False, indent=2),
136
  json.dumps(score, ensure_ascii=False, indent=2),
137
+ summaries.get("300chars", ""),
138
+ summaries.get("100chars", ""),
139
+ summaries.get("onesent", ""),
140
  anon_pdf,
141
  json.dumps(commit_info or {"status": "skipped (DATASET_REPO not set)"}, ensure_ascii=False, indent=2),
142
  )
 
150
  label="レジュメ類 (PDF/画像/Word/テキスト) 複数可",
151
  file_count="multiple",
152
  file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".docx", ".txt"],
153
+ type="filepath", # v4仕様: 'file' は無効
154
  )
155
  candidate_id = gr.Textbox(label="候補者ID(任意。未入力なら自動生成)")
156
  notes = gr.Textbox(label="補足メモ(任意)", lines=3)
157
 
158
+ run_btn = gr.Button("実行")
159
 
160
  with gr.Tab("構造化JSON"):
161
  out_json = gr.Code(label="統合出力 (JSON)")
162
 
163
  with gr.Tab("抽出スキル"):
164
+ # gr.JSON Gradio v4 の API 情報生成で例外を起こすケースがあるため避ける
165
+ out_skills = gr.Code(label="スキル一覧(JSON)")
166
 
167
  with gr.Tab("品質スコア"):
168
+ out_score = gr.Code(label="品質評価(JSON)")
169
 
170
  with gr.Tab("要約 (300/100/1文)"):
171
  out_sum_300 = gr.Textbox(label="300字要約")
 
186
 
187
 
188
if __name__ == "__main__":
    try:
        # Normal path: local launch, bound to all interfaces on port 7860.
        demo.launch(server_name="0.0.0.0", server_port=7860)
    except ValueError:
        # Fallback when the runtime cannot reach localhost (e.g. some sandboxed
        # hosts) — Gradio raises ValueError there; retry with a share link.
        demo.launch(share=True)