Corin1998 committed on
Commit
819818d
·
verified ·
1 Parent(s): c8b6d3b

Create parsing.py

Browse files
Files changed (1) hide show
  1. pipelines/parsing.py +41 -0
pipelines/parsing.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import Dict, List
3
+
4
# Pairs of (canonical section key, heading strings listed for that section).
# Each section lists its Japanese headings first, followed by English ones.
SECTION_HEADERS = [
    (
        "work_experience",
        ["職歴", "職務経歴", "業務経験", "Work Experience", "Experience"],
    ),
    (
        "education",
        ["学歴", "Education"],
    ),
    (
        "certifications",
        ["資格", "認定", "Certificates", "Certifications"],
    ),
    (
        "skills",
        ["スキル", "Skills"],
    ),
]
10
+
11
+
12
# Employment period such as "2020.04 - 2022.03", "2020/4〜現在" or
# "2020年4月～2022年3月". The month may carry a trailing "月", the separator may be
# a hyphen, en/em dash, wave dash or (full-width) tilde, and the end of the
# range is optional so open-ended periods still match.
_PERIOD_RE = re.compile(
    r"(\d{4}[./年]\d{1,2}月?)\s*[-–—〜～~]\s*(\d{4}[./年]?\d{0,2}月?|現在|至今)?"
)

# Skills are separated by an ideographic comma, an ASCII comma or a newline.
_SKILL_SPLIT_RE = re.compile(r"[、,\n]\s*")


def _section_text(sections_dict: Dict[str, str], key: str) -> str:
    """Return the value under *key*, else under ``<key>_raw``, else ``""``."""
    # The trailing `or ""` covers the case where both entries are None/empty,
    # so callers can always call str methods on the result.
    return sections_dict.get(key) or sections_dict.get(key + "_raw") or ""


def _nonblank_lines(text: str) -> List[Dict[str, str]]:
    """Wrap every non-blank line of *text* as ``{"text": <stripped line>}``."""
    return [{"text": line.strip()} for line in text.splitlines() if line.strip()]


def normalize_resume(sections_dict: Dict[str, str]) -> Dict[str, List[Dict]]:
    """Turn raw resume section text into lists of structured items.

    Args:
        sections_dict: Mapping read for the keys ``work_experience``,
            ``education``, ``certifications`` and ``skills``. For the first
            three, ``<key>_raw`` is used when ``<key>`` is missing or empty.
            ``skills`` may be a string or a list. ``None`` values are treated
            as empty.

    Returns:
        A dict with:
          * ``work_experience``: one ``{"period", "text"}`` dict per line that
            contains a date range; lines without one are skipped.
          * ``education`` / ``certifications``: one ``{"text"}`` dict per
            non-blank line.
          * ``skills``: a list of skill entries (a copy of the input list, or
            the string split on commas / newlines with blanks removed).
          * ``raw_sections``: the unparsed text used for the first three
            sections.
    """
    raw_work = _section_text(sections_dict, "work_experience")
    raw_education = _section_text(sections_dict, "education")
    raw_certifications = _section_text(sections_dict, "certifications")

    work_items = []
    for line in raw_work.splitlines():
        match = _PERIOD_RE.search(line)
        if match:
            # strip(): an open-ended range ("2021.01 - Freelance") would
            # otherwise keep the whitespace consumed after the separator.
            work_items.append({"period": match.group(0).strip(), "text": line.strip()})

    # skills accepts either a ready-made list or comma/newline separated text.
    skills_raw = sections_dict.get("skills") or ""
    if isinstance(skills_raw, list):
        # Copy so that mutating the result never alters the caller's list.
        skill_items = list(skills_raw)
    else:
        skill_items = [s.strip() for s in _SKILL_SPLIT_RE.split(skills_raw) if s.strip()]

    return {
        "work_experience": work_items,
        "education": _nonblank_lines(raw_education),
        "certifications": _nonblank_lines(raw_certifications),
        "skills": skill_items,
        "raw_sections": {
            "work_experience": raw_work,
            "education": raw_education,
            "certifications": raw_certifications,
        },
    }