Corin1998 committed on
Commit
150af7a
·
verified ·
1 Parent(s): d8bdfe7

Create pipelines/skills.py

Browse files
Files changed (1) hide show
  1. pipelines/skills.py +45 -0
pipelines/skills.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import Dict, List
3
+
4
# Skill names that extract_skills looks for (case-insensitively) in the
# input text, grouped by category.
SKILL_LEXICON = [
    # Programming languages
    "Python",
    "C++",
    "Java",
    "Go",
    "Rust",
    "JavaScript",
    "TypeScript",
    "SQL",
    "R",
    # Frameworks / tools
    "PyTorch",
    "TensorFlow",
    "Keras",
    "scikit-learn",
    "Hugging Face",
    "Transformers",
    "FastAPI",
    "Django",
    "Flask",
    "React",
    "Vue",
    "Next.js",
    "Node.js",
    # Data platforms
    "Spark",
    "Hadoop",
    "Airflow",
    "dbt",
    "Kafka",
    # Cloud
    "AWS",
    "GCP",
    "Azure",
    "Docker",
    "Kubernetes",
    # Analytics / BI
    "Tableau",
    "Power BI",
    "Looker",
    # Miscellaneous
    "Git",
    "Linux",
    "Terraform",
    "OpenAPI",
]

# Substrings that mark a line as a candidate for containing a person's name
# ("氏名" is Japanese for "full name").
NAME_HINTS = ["氏名", "Name"]
21
+
22
+
23
+ def _regex_ner_contacts(text: str) -> Dict[str, List[str]]:
24
+ emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)
25
+ phones = re.findall(r"(?:\+?\d{1,3}[ -]?)?(?:\(\d{2,4}\)[ -]?)?\d{2,4}[ -]?\d{2,4}[ -]?\d{3,4}", text)
26
+ return {"EMAIL": list(set(emails)), "PHONE": list(set(phones))}
27
+
28
+
29
def extract_skills(text: str, sections: Dict[str, str]) -> Dict:
    """Extract known skills, contact details and name-candidate lines from text.

    Args:
        text: Full plain text to scan.
        sections: Mapping of section name to section text. Accepted for
            interface compatibility; this function does not read it.

    Returns:
        A dict with three keys:
          * "skills": alphabetically sorted entries of SKILL_LEXICON that
            occur in ``text`` as standalone tokens (case-insensitive).
          * "contacts": the result of ``_regex_ner_contacts(text)``.
          * "name_candidates": stripped lines containing one of NAME_HINTS,
            at most three per hint, without duplicates.
    """
    contacts = _regex_ner_contacts(text)

    name_lines: List[str] = []
    all_lines = text.splitlines()
    for hint in NAME_HINTS:
        hits = [line.strip() for line in all_lines if hint in line][:3]
        for hit in hits:
            # A line containing several hints must be reported only once.
            if hit not in name_lines:
                name_lines.append(hit)

    found = []
    for skill in SKILL_LEXICON:
        # Plain substring search would report "R" for any text containing
        # the letter r, "Java" for "JavaScript", "Go" for "Google", etc.
        # Require that the match is not preceded by an ASCII letter/digit
        # and not followed by an ASCII letter. ASCII classes (rather than
        # \b) keep matches adjacent to non-ASCII text such as Japanese
        # working, and a trailing digit is allowed so version suffixes
        # like "Python3" still count.
        pattern = r"(?<![A-Za-z0-9])" + re.escape(skill) + r"(?![A-Za-z])"
        if re.search(pattern, text, re.IGNORECASE):
            found.append(skill)

    return {
        "skills": sorted(set(found)),
        "contacts": contacts,
        "name_candidates": name_lines,
    }