Spaces:
Paused
Paused
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| # | |
| import json | |
| from deepdoc.parser.resume.entities import degrees, regions, industries | |
# Output schema of refactor(): one "<name> <type>" entry per output field.
# Only the first token (the name) is read in this file; the second token looks
# like a column type for a downstream table -- TODO confirm against consumers.
#
# NOTE: refactor() pairs these names *positionally* with its alphabetically
# sorted columns, where the "*_obj" fields come from columns named without the
# suffix (e.g. "work_obj" <- "work"). That is why "work_obj" precedes
# "work_experience" here. Do not reorder entries independently of refactor().
FIELDS = [
    # --- a .. d ---
    "address STRING",
    "annual_salary int",
    "annual_salary_from int",
    "annual_salary_to int",
    "birth STRING",
    "card STRING",
    "certificate_obj string",
    "city STRING",
    "corporation_id int",
    "corporation_name STRING",
    "corporation_type STRING",
    "degree STRING",
    "discipline_name STRING",
    # --- e .. g ---
    "education_obj string",
    "email STRING",
    "expect_annual_salary int",
    "expect_city_names string",
    "expect_industry_name STRING",
    "expect_position_name STRING",
    "expect_salary_from int",
    "expect_salary_to int",
    "expect_type STRING",
    "gender STRING",
    # --- i .. n ---
    "industry_name STRING",
    "industry_names STRING",
    "is_deleted STRING",
    "is_fertility STRING",
    "is_house STRING",
    "is_management_experience STRING",
    "is_marital STRING",
    "is_oversea STRING",
    "language_obj string",
    "name STRING",
    "nation STRING",
    # --- p .. s ---
    "phone STRING",
    "political_status STRING",
    "position_name STRING",
    "project_obj string",
    "responsibilities string",
    "salary_month int",
    "scale STRING",
    "school_name STRING",
    "self_remark string",
    "skill_obj string",
    # --- t .. w ---
    "title_name STRING",
    "tob_resume_id STRING",
    "updated_at Timestamp",
    "wechat STRING",
    "work_obj string",
    "work_experience int",
    "work_start_time BIGINT",
]
def refactor(df):
    """Flatten the JSON resume held in ``df["resume_content"]`` into one record.

    Each row's ``resume_content`` is parsed as JSON and selected keys are
    copied into flat columns: whole sections (education, work, ...) are
    re-serialised as JSON strings, while scalar fields are read from the
    ``contact`` and ``basic`` sub-objects. A few fields are then normalised
    (degree / address / industry lookups, gender and Y/N flags translated to
    Chinese labels, ``tel`` used as a fallback for a blank ``phone``).

    Args:
        df: pandas DataFrame with at least a ``resume_content`` column.
            ``tob_resume_id`` is expected too; if absent it is emitted as the
            string form of a missing value. The frame is modified in place
            (helper and output columns are added to it).

    Returns:
        dict mapping the field names declared in ``FIELDS`` to the sanitised
        string values of the FIRST row only.

    Raises:
        IndexError: if ``df`` has no rows.
    """

    def deal_obj(obj, k, kk):
        """Return ``obj[k][kk]``, or "" when either level is not a dict."""
        if not isinstance(obj, dict):
            return ""
        obj = obj.get(k, {})
        if not isinstance(obj, dict):
            return ""
        return obj.get(kk, "")

    def loadjson(line):
        """Parse ``line`` as JSON; fall back to an empty dict on any failure."""
        try:
            return json.loads(line)
        except Exception:
            # Deliberate best effort: malformed or non-string content must not
            # abort the whole conversion; the row simply yields empty fields.
            return {}

    def dump_section(obj, key):
        """Serialise the top-level section ``key`` of ``obj`` as a JSON string."""
        if isinstance(obj, dict):
            section = obj.get(key)
            if isinstance(section, dict) or not section:
                # Missing key -> "{}"; an explicit null stays "null".
                return json.dumps(obj.get(key, {}), ensure_ascii=False)
        # NOTE(review): this fallback stringifies the WHOLE parsed object, not
        # just ``obj[key]`` (reached when obj is not a dict, or the section is
        # a non-empty non-dict such as a list). Kept as-is; confirm intent.
        return str(obj).replace("None", "")

    df["obj"] = df["resume_content"].map(loadjson)
    df.fillna("", inplace=True)

    # Names of every column produced so far; pruned and sorted further below.
    clms = ["tob_resume_id", "updated_at"]

    def extract(nms, cc=None):
        """Create one column per name in ``nms``.

        With ``cc`` set, values come from ``obj[cc][name]``; otherwise the
        top-level section ``obj[name]`` is serialised to a JSON string.
        """
        clms.extend(nms)
        for c in nms:
            if cc:
                df[c] = df["obj"].map(lambda x, c=c: deal_obj(x, cc, c))
            else:
                df[c] = df["obj"].map(lambda x, c=c: dump_section(x, c))

    extract(["education", "work", "certificate", "project", "language",
             "skill"])
    extract(["wechat", "phone", "is_deleted",
             "name", "tel", "email"], "contact")
    extract(["nation", "expect_industry_name", "salary_month",
             "industry_ids", "is_house", "birth", "annual_salary_from",
             "annual_salary_to", "card",
             "expect_salary_to", "expect_salary_from",
             "expect_position_name", "gender", "city",
             "is_fertility", "expect_city_names",
             "political_status", "title_name", "expect_annual_salary",
             "industry_name", "address", "position_name", "school_name",
             "corporation_id",
             "is_oversea", "responsibilities",
             "work_start_time", "degree", "management_experience",
             "expect_type", "corporation_type", "scale", "corporation_name",
             "self_remark", "annual_salary", "work_experience",
             "discipline_name", "marital", "updated_at"], "basic")

    # Lookups via the project's entity helpers (presumably id/code -> display
    # name; their exact contract is defined outside this file).
    df["degree"] = df["degree"].map(lambda x: degrees.get_name(x))
    df["address"] = df["address"].map(
        lambda x: " ".join(regions.get_names(x)))
    df["industry_names"] = df["industry_ids"].map(
        lambda x: " ".join(
            " ".join(industries.get_names(i)) for i in str(x).split(",")))
    clms.append("industry_names")

    def arr2str(a):
        """Render a list or comma-separated value as space-separated text."""
        if not a:
            return ""
        if isinstance(a, list):
            a = " ".join([str(i) for i in a])
        return str(a).replace(",", " ")

    df["expect_industry_name"] = df["expect_industry_name"].map(arr2str)

    gender_labels = {"M": "男", "F": "女"}
    df["gender"] = df["gender"].map(
        lambda x: "男" if x == 'M' else ("女" if x == 'F' else ""))

    # Y/N flags become Chinese yes/no labels; anything else becomes "".
    for c in ["is_fertility", "is_oversea", "is_house",
              "management_experience", "marital"]:
        df[c] = df[c].map(
            lambda x: '是' if x == 'Y' else ('否' if x == 'N' else ""))
    df["is_management_experience"] = df["management_experience"]
    df["is_marital"] = df["marital"]
    clms.extend(["is_management_experience", "is_marital"])

    df.fillna("", inplace=True)

    def pick_phone(phone, tel):
        """Return ``tel`` (stripped) when ``phone`` is blank, else ``phone``."""
        tel_text = str(tel).strip()
        if not str(phone).strip() and tel_text:
            return tel_text
        return phone

    # Positional pairing instead of ``df.loc[i, ...]`` over ``range(len(df))``:
    # works for any index labels, and ``str()`` tolerates non-string JSON
    # values (e.g. a numeric phone) that have no ``.strip()``.
    df["phone"] = [pick_phone(p, t) for p, t in zip(df["phone"], df["tel"])]

    # Drop intermediate helper columns and de-duplicate ("updated_at" is
    # registered twice above), then order the output alphabetically.
    helper_columns = {"industry_ids", "management_experience", "marital",
                      "tel"}
    clms = sorted(set(clms) - helper_columns)
    df = df.reindex(clms, axis=1)

    # Make every value a single-line, tab-free string: tabs become spaces and
    # line breaks become a literal backslash-n.
    for c in clms:
        df[c] = df[c].map(
            lambda s: str(s).replace("\t", " ")
                            .replace("\n", "\\n")
                            .replace("\r", "\\n"))

    # FIELDS is paired POSITIONALLY with the sorted columns, so its order must
    # stay in step with ``clms`` (e.g. "work_obj" lines up with column "work").
    return dict(zip([n.split(" ")[0] for n in FIELDS],
                    df.values.tolist()[0]))