Corin1998 committed on
Commit
ccf8168
·
verified ·
1 Parent(s): 4b92cf5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -24
app.py CHANGED
@@ -2,9 +2,9 @@ import os
2
  import io
3
  import json
4
  import hashlib
5
- import gradio # 前方参照の解決(必要なら)
6
  import gradio as gr
7
- from typing import Any, List
8
 
9
  from pipelines.openai_ingest import (
10
  extract_text_with_openai,
@@ -15,7 +15,7 @@ from pipelines.parsing import normalize_resume
15
  from pipelines.merge import merge_normalized_records
16
  from pipelines.skills import extract_skills
17
 
18
- # ---- anonymize フォールバック(pipelines/anonymize.py が未実装でも動く) ----
19
  try:
20
  from pipelines.anonymize import anonymize_text, render_anonymized_pdf # type: ignore
21
  except Exception:
@@ -28,7 +28,6 @@ except Exception:
28
  A4 = None
29
 
30
  def anonymize_text(text: str):
31
- # 超簡易:メール/電話っぽい所をマスク。氏名は見出し候補を雑にマスク。
32
  masked = re.sub(r"([A-Za-z0-9._%+-]+)@([A-Za-z0-9.-]+\.[A-Za-z]{2,})", r"***@\2", text)
33
  masked = re.sub(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}", "***-****-****", masked)
34
  masked = re.sub(r"(氏名[::]?\s*)(\S+)", r"\1***", masked)
@@ -36,22 +35,21 @@ except Exception:
36
 
37
  def render_anonymized_pdf(text: str) -> bytes:
38
  if canvas is None:
39
- # reportlab が無ければテキストファイルで代替(UIは .pdf 名で返す)
40
  return text.encode("utf-8")
41
  buf = io.BytesIO()
42
  c = canvas.Canvas(buf, pagesize=A4)
43
  width, height = A4
44
- margin = 40
45
- y = height - margin
46
  for line in text.splitlines() or ["(no content)"]:
47
- if y < margin:
48
  c.showPage()
49
- y = height - margin
50
- c.drawString(margin, y, line[:95])
51
  y -= 14
52
  c.save()
53
  return buf.getvalue()
54
- # ---------------------------------------------------------------------------
55
 
56
  from pipelines.scoring import compute_quality_score
57
  from pipelines.storage import persist_to_hf
@@ -60,7 +58,7 @@ from pipelines.utils import detect_filetype, load_doc_text
60
  APP_TITLE = "候補者インテーク & レジュメ標準化(OpenAI版)"
61
 
62
 
63
- def _read_bytes_from_path(path: str) -> bytes:
64
  with open(path, "rb") as f:
65
  return f.read()
66
 
@@ -77,10 +75,10 @@ def process_resumes(files: List[str], candidate_id: str = "", additional_notes:
77
 
78
  for path in files:
79
  filename = os.path.basename(path)
80
- raw_bytes = _read_bytes_from_path(path)
81
  filetype = detect_filetype(filename, raw_bytes)
82
 
83
- # 1) テキスト抽出:画像/PDFはOpenAI Vision OCR、docx/txtは生文面+OpenAI整形
84
  if filetype in {"pdf", "image"}:
85
  text = extract_text_with_openai(raw_bytes, filename=filename, filetype=filetype)
86
  else:
@@ -89,7 +87,7 @@ def process_resumes(files: List[str], candidate_id: str = "", additional_notes:
89
 
90
  raw_texts.append({"filename": filename, "text": text})
91
 
92
- # 2) OpenAIでセクション構造化 ルール正規化
93
  structured = structure_with_openai(text)
94
  normalized = normalize_resume({
95
  "work_experience": structured.get("work_experience_raw", ""),
@@ -104,10 +102,10 @@ def process_resumes(files: List[str], candidate_id: str = "", additional_notes:
104
  "normalized": normalized,
105
  })
106
 
107
- # 3) 統合(複数ファイル→1候補者)
108
  merged = merge_normalized_records([r["normalized"] for r in partial_records])
109
 
110
- # 4) スキル抽出(辞書/正規表現)
111
  merged_text = "\n\n".join([r["text"] for r in partial_records])
112
  skills = extract_skills(merged_text, {
113
  "work_experience": merged.get("raw_sections", {}).get("work_experience", ""),
@@ -123,10 +121,10 @@ def process_resumes(files: List[str], candidate_id: str = "", additional_notes:
123
  # 6) 品質スコア
124
  score = compute_quality_score(merged_text, merged)
125
 
126
- # 7) 要約(300/100/1文)
127
  summaries = summarize_with_openai(merged_text)
128
 
129
- # 8) 構造化出力
130
  cid = candidate_id or hashlib.sha256(merged_text.encode("utf-8")).hexdigest()[:16]
131
  result_json = {
132
  "candidate_id": cid,
@@ -152,10 +150,11 @@ def process_resumes(files: List[str], candidate_id: str = "", additional_notes:
152
  pdf_path=f"candidates/{cid}.anon.pdf",
153
  )
154
 
155
- # UI 向け出力を整形
156
  anon_pdf = (f"{cid}.anon.pdf", anon_pdf_bytes)
 
 
157
  out_json_str = json.dumps(result_json, ensure_ascii=False, indent=2)
158
- out_skills_str = json.dumps(skills, ensure_ascii=False, indent=2) # gr.Code で表示
159
  out_score_str = json.dumps(score, ensure_ascii=False, indent=2)
160
  out_commit_str = json.dumps(commit_info or {"status": "skipped (DATASET_REPO not set)"}, ensure_ascii=False, indent=2)
161
 
@@ -179,7 +178,7 @@ with gr.Blocks(title=APP_TITLE) as demo:
179
  label="レジュメ類 (PDF/画像/Word/テキスト) 複数可",
180
  file_count="multiple",
181
  file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".docx", ".txt"],
182
- type="filepath", # 重要:'file' ではなく 'filepath'
183
  )
184
  candidate_id = gr.Textbox(label="候補者ID(任意。未入力なら自動生成)")
185
  notes = gr.Textbox(label="補足メモ(任意)", lines=3)
@@ -190,7 +189,7 @@ with gr.Blocks(title=APP_TITLE) as demo:
190
  out_json = gr.Code(label="統合出力 (JSON)")
191
 
192
  with gr.Tab("抽出スキル"):
193
- out_skills = gr.Code(label="スキル一覧 (JSON)") # JSON -> Code に変更
194
 
195
  with gr.Tab("品質スコア"):
196
  out_score = gr.Code(label="品質評価 (JSON)")
@@ -214,5 +213,5 @@ with gr.Blocks(title=APP_TITLE) as demo:
214
  )
215
 
216
  if __name__ == "__main__":
217
- # 環境により localhost へ到達できない場合があるため share=True を既定に
218
  demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
 
2
  import io
3
  import json
4
  import hashlib
5
+ import gradio # 一部の前方参照バグ回避
6
  import gradio as gr
7
+ from typing import List
8
 
9
  from pipelines.openai_ingest import (
10
  extract_text_with_openai,
 
15
  from pipelines.merge import merge_normalized_records
16
  from pipelines.skills import extract_skills
17
 
18
+ # --- 匿名化のフォールバック(pipelines/anonymize.py が空/未実装でも動く) ---
19
  try:
20
  from pipelines.anonymize import anonymize_text, render_anonymized_pdf # type: ignore
21
  except Exception:
 
28
  A4 = None
29
 
30
  def anonymize_text(text: str):
 
31
  masked = re.sub(r"([A-Za-z0-9._%+-]+)@([A-Za-z0-9.-]+\.[A-Za-z]{2,})", r"***@\2", text)
32
  masked = re.sub(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}", "***-****-****", masked)
33
  masked = re.sub(r"(氏名[::]?\s*)(\S+)", r"\1***", masked)
 
35
 
36
  def render_anonymized_pdf(text: str) -> bytes:
37
  if canvas is None:
 
38
  return text.encode("utf-8")
39
  buf = io.BytesIO()
40
  c = canvas.Canvas(buf, pagesize=A4)
41
  width, height = A4
42
+ m = 40
43
+ y = height - m
44
  for line in text.splitlines() or ["(no content)"]:
45
+ if y < m:
46
  c.showPage()
47
+ y = height - m
48
+ c.drawString(m, y, line[:95])
49
  y -= 14
50
  c.save()
51
  return buf.getvalue()
52
+ # ----------------------------------------------------------------------
53
 
54
  from pipelines.scoring import compute_quality_score
55
  from pipelines.storage import persist_to_hf
 
58
  APP_TITLE = "候補者インテーク & レジュメ標準化(OpenAI版)"
59
 
60
 
61
+ def _read_bytes(path: str) -> bytes:
62
  with open(path, "rb") as f:
63
  return f.read()
64
 
 
75
 
76
  for path in files:
77
  filename = os.path.basename(path)
78
+ raw_bytes = _read_bytes(path)
79
  filetype = detect_filetype(filename, raw_bytes)
80
 
81
+ # 1) 抽出
82
  if filetype in {"pdf", "image"}:
83
  text = extract_text_with_openai(raw_bytes, filename=filename, filetype=filetype)
84
  else:
 
87
 
88
  raw_texts.append({"filename": filename, "text": text})
89
 
90
+ # 2) 構造化→正規化
91
  structured = structure_with_openai(text)
92
  normalized = normalize_resume({
93
  "work_experience": structured.get("work_experience_raw", ""),
 
102
  "normalized": normalized,
103
  })
104
 
105
+ # 3) 統合
106
  merged = merge_normalized_records([r["normalized"] for r in partial_records])
107
 
108
+ # 4) スキル抽出
109
  merged_text = "\n\n".join([r["text"] for r in partial_records])
110
  skills = extract_skills(merged_text, {
111
  "work_experience": merged.get("raw_sections", {}).get("work_experience", ""),
 
121
  # 6) 品質スコア
122
  score = compute_quality_score(merged_text, merged)
123
 
124
+ # 7) 要約
125
  summaries = summarize_with_openai(merged_text)
126
 
127
+ # 8) 出力まとめ
128
  cid = candidate_id or hashlib.sha256(merged_text.encode("utf-8")).hexdigest()[:16]
129
  result_json = {
130
  "candidate_id": cid,
 
150
  pdf_path=f"candidates/{cid}.anon.pdf",
151
  )
152
 
 
153
  anon_pdf = (f"{cid}.anon.pdf", anon_pdf_bytes)
154
+
155
+ # ★ UI用:すべて文字列化して返す(gr.JSON を使わない)
156
  out_json_str = json.dumps(result_json, ensure_ascii=False, indent=2)
157
+ out_skills_str = json.dumps(skills, ensure_ascii=False, indent=2)
158
  out_score_str = json.dumps(score, ensure_ascii=False, indent=2)
159
  out_commit_str = json.dumps(commit_info or {"status": "skipped (DATASET_REPO not set)"}, ensure_ascii=False, indent=2)
160
 
 
178
  label="レジュメ類 (PDF/画像/Word/テキスト) 複数可",
179
  file_count="multiple",
180
  file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".docx", ".txt"],
181
+ type="filepath", # 重要:'file' ではなく 'filepath'
182
  )
183
  candidate_id = gr.Textbox(label="候補者ID(任意。未入力なら自動生成)")
184
  notes = gr.Textbox(label="補足メモ(任意)", lines=3)
 
189
  out_json = gr.Code(label="統合出力 (JSON)")
190
 
191
  with gr.Tab("抽出スキル"):
192
+ out_skills = gr.Code(label="スキル一覧 (JSON)") # gr.JSON を避ける
193
 
194
  with gr.Tab("品質スコア"):
195
  out_score = gr.Code(label="品質評価 (JSON)")
 
213
  )
214
 
215
  if __name__ == "__main__":
216
+ # 到達性のため share=True 推奨
217
  demo.launch(server_name="0.0.0.0", server_port=7860, share=True)