# api-ta / extractor_llm.py
# m00913563
# add new return format
# a0e8e60
from openai import OpenAI
from models import CVExtracted
# Module-level OpenAI client, constructed once at import time with no explicit
# arguments and used by predict() below.
# NOTE(review): presumably picks up credentials from the environment
# (e.g. OPENAI_API_KEY) — confirm against the deployment configuration.
client = OpenAI()
# Model snapshot used when the caller does not pick one. Whatever is passed
# must support `response_format` parsing into a schema (Structured Outputs).
_DEFAULT_CV_MODEL = "gpt-4o-2024-08-06"

# System prompt: target schema followed by one worked example. The example is
# kept as strictly valid JSON (straight double quotes everywhere) so the model
# is never shown malformed output to imitate.
# NOTE(review): the schema says "gpa": integer and the example uses 341 —
# presumably GPA x 100; confirm against models.CVExtracted.
# NOTE(review): the example has no "achievements" key although the schema
# lists one — left as-is, confirm whether that is intentional.
_CV_SYSTEM_PROMPT = """
You are tasked to parse Curriculum Vitae files into JSON in the format below:
{
"name": string,
"skills": []string,
"links": []string,
"achievements": []string,
"experiences": []{
"start": string,
"end": string,
"designation": string,
"company": string,
"experience_description": string
},
"educations": []{
"start": string,
"end": string,
"major": string,
"campus": string,
"gpa": integer
}
}
below is the example:
{
"name": "Faiq Bil Haq Izzuddin",
"skills": [
"Python (Matplotlib, Pandas)",
"Seaborn,",
"Sklearn",
"TensorFlow",
"Keras,",
"NetworkX",
"Java",
"Excel (Linkedin Certification)",
"Advanced SQL by Hackerrank (Certificate)",
"PowerBI",
"Tableau",
"Metabase",
"Problem Solving",
"Basic by Hackerrank (Certificate)",
"SpreadSheet (Google Sheet)",
"NoSQL",
"Athena",
"GCP BigQuery",
"Deep Learning",
"ETL",
"Computer Vision",
"NLP",
"OCR",
"MLOps",
"Hadoop",
"PySpark."
],
"links": [
"linkedin.com/in/faiz-b-h/",
"github.com/mfaizbh22",
"faizzz.vercel.app",
"kaggle.com/mfaizb"
],
"experiences": [
{
"start": "2024-08-28T00:00:00.000Z",
"end": null,
"designation": "Data Analyst Engineer",
"company": "Professional Huawei Technology Co. Ltd.",
"experience_description": "Successfully managed deployment of inhouse data warehouse and analytics automation POC using Hadoop and Spark, potential to improve ROI up to millions dollars quarterly. Developed an anti - fraud retention ML model, saved up to hundred millions rupiah monthly and was recognized by CTO. Produce numerous data pipelines and reports using Hadoop and PowerBi with automatic issue resolution."
},
{
"start": "2023-12-01T00:00:00.000Z",
"end": "2024-03-01T00:00:00.000Z",
"designation": "Data Analyst Intern",
"company": "Kitalulus",
"experience_description": "Analyzed user behavior, successfully prevented over 10. 000 applicants from fraud and improved recommendation system. Evaluated and revamped ETL layer, 20 dashboards, 100 charts query and successfully achieved 90 percent cost reduction. Scraped and analyzed data, completed over 60 requests, included investor's report along with others"
},
{
"start": "2023-08-01T00:00:00.000Z",
"end": "2023-12-31T00:00:00.000Z",
"designation": "Course Assistant, IF3240",
"company": "STEI ITB",
"experience_description": "Assisted 4 teams with up to 20 students from batch 2021 to develop an information system Helped up to 20 students to analyze problem and implement system information solution from real companies problem"
},
{
"start": "2022-06-01T00:00:00.000Z",
"end": "2023-12-01T00:00:00.000Z",
"designation": "Backend Engineer Intern",
"company": "Kitalulus",
"experience_description": "Decreased manpower for warehouse management by 10 percent per month. Decreased cost for supplier and production operational cost up to Rp. 2. 5 M per week. Responsible for overwriting and optimizing the queries and databases table in postgresql by critical times."
}
],
"educations": [
{
"start": "2020-08-20T00:00:00.000Z",
"end": "2025-08-25T00:00:00.000Z",
"major": "B. Eng, Informatic Engineering",
"campus": "Institut Teknologi Bandung ",
"gpa": 341
}
]
}

If the document OCR read or extraction is null, please return with empty structure.
"""


def predict(input, *, model=_DEFAULT_CV_MODEL):
    """Ask the chat model to extract structured CV data from ``input``.

    The request consists of the fixed system prompt (schema plus one worked
    example) and ``input`` as the single user message. The response is parsed
    by the SDK into ``CVExtracted`` via ``response_format``.

    Args:
        input: Content of the user message, passed through unchanged.
            Presumably the text extracted/OCR'd from a CV — confirm against
            callers. (The name shadows the builtin; it is kept so that
            existing keyword callers keep working.)
        model: Chat model identifier. Defaults to the snapshot this function
            has always used.

    Returns:
        The ``parsed`` attribute of the first choice's message. Per the SDK
        documentation this is a ``CVExtracted`` instance, or ``None`` when
        the model returned a refusal instead of content — callers should be
        prepared for ``None``.

    Raises:
        Whatever the OpenAI client raises for transport/API errors; nothing
        is caught here.
    """
    completion = client.beta.chat.completions.parse(
        model=model,
        messages=[
            {"role": "system", "content": _CV_SYSTEM_PROMPT},
            {"role": "user", "content": input},
        ],
        response_format=CVExtracted,
    )
    return completion.choices[0].message.parsed