Corin1998 committed on
Commit
1149a64
·
verified ·
1 Parent(s): d0bc04c

Create marge.py

Browse files
Files changed (1) hide show
  1. pipelines/marge.py +56 -0
pipelines/marge.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict
2
+ import re
3
+
4
+ def _period_key(item_text: str) -> str:
5
+ m = re.search(r"(\d{4}[./年]\d{1,2})\s*[-〜~]\s*(\d{4}[./年]?\d{0,2}|現在|至今)?", item_text)
6
+ return m.group(0) if m else item_text[:50]
7
+
8
+
9
def merge_normalized_records(records: List[Dict]) -> Dict:
    """Merge several normalized records into one de-duplicated record.

    Each record is read through the keys ``work_experience``, ``education``,
    ``certifications``, ``skills`` and ``raw_sections``. A key that is
    missing or holds ``None`` is treated as empty.

    Args:
        records: Records to merge, processed in order.

    Returns:
        A dict with the same five keys:

        * ``work_experience``: unique entries. Entries whose ``period`` (or,
          failing that, ``text``) contains a ``YYYY<sep>M`` token come first,
          newest start first; the rest keep their original relative order.
        * ``education`` / ``certifications``: entries with a non-empty
          ``text``, de-duplicated on the exact text, in first-seen order.
        * ``skills``: the sorted set of truthy skill values.
        * ``raw_sections``: per key, the values from all records joined with
          newlines and stripped.
    """
    merged = {"work_experience": [], "education": [], "certifications": [], "skills": [], "raw_sections": {}}
    start_pattern = re.compile(r"(\d{4})([./年])(\d{1,2})")

    seen_we = set()
    seen_edu = set()
    seen_cert = set()
    skill_set = set()

    for r in records:
        # work
        for w in r.get("work_experience") or []:
            text = w.get("text") or ""
            if text:
                key = ("text", _period_key(text) + "|" + text[:80])
            else:
                # Without text every entry would share one key and all but
                # the first would be dropped; key on the full field set so
                # only true duplicates collapse.
                key = ("fields", repr(sorted(w.items(), key=lambda kv: str(kv[0]))))
            if key not in seen_we:
                seen_we.add(key)
                merged["work_experience"].append(w)
        # edu
        for e in r.get("education") or []:
            k = e.get("text") or ""
            if k and k not in seen_edu:
                seen_edu.add(k)
                merged["education"].append(e)
        # cert
        for c in r.get("certifications") or []:
            k = c.get("text") or ""
            if k and k not in seen_cert:
                seen_cert.add(k)
                merged["certifications"].append(c)
        # skills
        for s in r.get("skills") or []:
            if s:
                skill_set.add(s)
        # raw sections: concatenate the values of every record, in order
        for k, v in (r.get("raw_sections") or {}).items():
            piece = "" if v is None else str(v)
            merged["raw_sections"][k] = (merged["raw_sections"].get(k, "") + "\n" + piece).strip()

    # Sort: entries with a period-like token first, the rest in original order.
    def _sort_key(w):
        m = start_pattern.search(w.get("period") or w.get("text") or "")
        if m:
            # Negated YYYYMM puts the most recent start date first.
            return (0, -(int(m.group(1)) * 100 + int(m.group(3))))
        return (1, 0)

    # list.sort is stable, so undated entries keep their relative order.
    merged["work_experience"].sort(key=_sort_key)
    merged["skills"] = sorted(skill_set)
    return merged