import time
from uuid import uuid4
from pydantic import Field
from typing import TypedDict, List, Dict, Union
from langchain_core.prompts import ChatPromptTemplate
from services.models.data_model import AIProfile, RawProfile, Profiles
# from services.base.BaseGenerator import BaseAIGenerator, MetadataObservability
from services.base.BaseGenerator_v2 import BaseAIGenerator, MetadataObservability
from services.llms.LLM import model_5mini, model_4o_2
from utils.decorator import trace_runtime
from utils.logger import get_logger
from fastapi import HTTPException, status
logger = get_logger("profile extraction")
from sqlalchemy.ext.asyncio import AsyncSession
from externals.databases.pg_crud import (
get_file_by_filename,
get_profile_by_filename,
mark_file_extracted,
create_profile,
)
from externals.databases.pg_models import CVUser, CVProfile
from utils.logger import get_logger
from externals.storages.azure_blob import download_blob_by_filename
from services.extractor.pdf import extract_text_from_pdf_bytes
logger = get_logger("knowledge.extract")
class KnowledgeExtractService:
    """Extract a structured candidate profile from an uploaded CV file.

    Orchestrates the full pipeline: load the file's metadata, download the
    PDF from blob storage, extract its raw text, run an LLM structured
    extraction, persist the resulting profile and mark the file as extracted.
    """

    def __init__(self, db: AsyncSession, user: CVUser):
        # The session and authenticated user scope every query/storage call.
        self.db = db
        self.user = user

    @trace_runtime
    async def extract(self, filename: str) -> CVProfile:
        """Run the end-to-end extraction pipeline for one uploaded file.

        Args:
            filename: Name of the CV file owned by the current user.

        Returns:
            The persisted ``CVProfile`` row.

        Raises:
            HTTPException: 404 if the file is unknown, 400 if the PDF has no
                extractable text, 500 if the AI extraction returns nothing.
        """
        # 1️⃣ Load file metadata, scoped to the current user.
        file = await get_file_by_filename(
            self.db, filename=filename, user_id=self.user.user_id
        )
        if not file:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                # Fixed: the placeholder was missing, so the message always
                # said "(unknown)" instead of the requested filename.
                detail=f"File '{filename}' not found",
            )

        # Short-circuit: reuse the stored profile when this file was already
        # processed; if none is found we fall through and re-extract.
        if file.is_extracted:
            logger.info(f"ℹ️ File already extracted: {filename}")
            existing = await get_profile_by_filename(
                self.db, filename=filename, current_user=self.user
            )
            if existing:
                return existing

        # 2️⃣ Download the PDF bytes from blob storage.
        pdf_bytes: bytes = await download_blob_by_filename(
            filename=file.filename,
            tenant_id=self.user.tenant_id,
            user_id=self.user.user_id,
        )

        # 3️⃣ Extract raw text. An empty/whitespace-only result means the PDF
        # is unusable (e.g. scanned images without a text layer).
        text = await extract_text_from_pdf_bytes(pdf_bytes)
        if not text.strip():
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="PDF contains no extractable text",
            )

        # 4️⃣ Wrap the raw text for the AI extraction step.
        raw_profile = RawProfile(
            filename=file.filename,
            content_type=file.file_type,
            content=text,
            profile_id=str(uuid4()),
        )

        # 5️⃣ Run the LLM-based structured extraction. The helper returns a
        # falsy value ({}) on failure rather than raising.
        ai_profile: AIProfile = await self._extract_with_ai(raw_profile)
        if not ai_profile:
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail="AI extraction failed",
            )

        # 6️⃣ INSERT the profile into cv_profile and
        # 7️⃣ flip cv_file.is_extracted, committing both in one transaction
        # so a crash between the two cannot leave them inconsistent.
        profile = await create_profile(
            db=self.db,
            filename=file.filename,
            file_id=file.file_id,
            profile=ai_profile,
        )
        await mark_file_extracted(db=self.db, file_id=file.file_id)
        await self.db.commit()

        logger.info(
            "✅ Profile extracted & persisted",
            extra={"context": {
                "filename": filename,
                "profile_id": profile.profile_id,
            }},
        )
        return profile

    @trace_runtime
    async def _extract_with_ai(self, raw_profile: RawProfile) -> AIProfile:
        """Extract a structured ``AIProfile`` from raw CV text via the LLM.

        Never raises: on any failure it logs and returns an empty dict
        (falsy) so the caller decides how to surface the error.
        """
        try:
            extract_one_profile_prompt = """
You are an intelligent information extraction assistant.
Your task is to read the following Curriculum Vitae (CV) text and extract structured information according to the expected output below.
----------------------------
candidate's curriculum vitae:
{cv}
----------------------------
**Expected Output**:
Follow the data schema from AIProfile
**Instructions**:
1. Read the provided CV and extract information needed based on expected output.
2. Be Careful when to extract information for gpa_edu_1, univ_edu_1, major_edu_1 or gpa_edu_2, univ_edu_2, major_edu_2 or gpa_edu_3, univ_edu_3, major_edu_3
..._edu_1 is only for bachelor/undergraduate/sarjana degree
..._edu_2 is only for master/postgraduate degree
..._edu_3 is only for doctor/phd degree
3. Reformat the extracted info using correct word spacing.
4. Do not verbose, just return the final answer.
""".strip()
            prompt = ChatPromptTemplate.from_template(extract_one_profile_prompt)
            input_llm = {
                "cv": raw_profile.get("content")
            }
            # Structured output constrains the model to the AIProfile schema.
            llm = model_4o_2.with_structured_output(AIProfile)
            gen_ai = BaseAIGenerator(
                task_name="extract profile",
                prompt=prompt,
                input_llm=input_llm,
                llm=llm,
                metadata_observability=MetadataObservability(
                    fullname=self.user.full_name,
                    task_id=str(uuid4()),
                    agent=self.extract.__name__,
                    user_id=self.user.email,
                ),
            )
            result = await gen_ai.agenerate()
            if result:
                logger.info("✅ extract one profile success")
                return result
            logger.error(f"""❌ extract one profile failed, something went wrong for profile_id {raw_profile.get("profile_id")}""")
            return {}
        except Exception as E:
            logger.error(f"""❌ extract one profile error for profile_id {raw_profile.get("profile_id")}, {E}""")
            return {}
# @trace_runtime
# async def extract_bulk_profile(self, raw_profiles: List[RawProfile]) -> Profiles:
# try:
# profiles = []
# failed_id = []
# for _, cv_text in enumerate(raw_profiles):
# profile = await self.extract(cv_text.get("content"))
# time.sleep(2)
# if profile:
# logger.info(f"✅ extract bulk profile success [{_+1}/{len(raw_profiles)}]")
# profiles.append(profile)
# else:
# logger.info(f"""❌ extract bulk profile error for profile {cv_text.get("profile_id")}, [{_+1}/{len(raw_profiles)}]""")
# profiles[cv_text.get("profile_id")] = profile
# failed_id.append(cv_text.get("profile_id"))
# bulk_profile = Profiles(
# profiles=profiles
# )
# return bulk_profile
# except Exception as E:
# logger.error(f"❌ extract bulk profile error for profile_id {failed_id}, {E}")
# return Profiles(
# profiles=[]
# )