"""Knowledge extraction service.

Downloads a user's uploaded CV (PDF), extracts its text, asks an LLM for a
structured profile, and persists the result.
"""
import time
from uuid import uuid4
from pydantic import Field
from typing import TypedDict, List, Dict, Union
from langchain_core.prompts import ChatPromptTemplate

from services.models.data_model import AIProfile, RawProfile, Profiles
# from services.base.BaseGenerator import BaseAIGenerator, MetadataObservability
from services.base.BaseGenerator_v2 import BaseAIGenerator, MetadataObservability
from services.llms.LLM import model_5mini, model_4o_2
from utils.decorator import trace_runtime
from utils.logger import get_logger
from fastapi import HTTPException, status
logger = get_logger("profile extraction")


from sqlalchemy.ext.asyncio import AsyncSession
from externals.databases.pg_crud import (
    get_file_by_filename,
    get_profile_by_filename,
    mark_file_extracted,
    create_profile,
)
from externals.databases.pg_models import CVUser, CVProfile
from utils.logger import get_logger
from externals.storages.azure_blob import download_blob_by_filename
from services.extractor.pdf import extract_text_from_pdf_bytes
logger = get_logger("knowledge.extract")


class KnowledgeExtractService:
    """Turn an uploaded CV file into a persisted, structured profile.

    The service is bound to one database session and one user. ``extract``
    is the public entry point; ``_extract_with_ai`` wraps the LLM call.
    """

    def __init__(self, db: AsyncSession, user: CVUser):
        """Store the database session and the requesting user.

        Args:
            db: Async session used for every lookup and write in this service.
            user: The user on whose behalf files are looked up and downloaded.
        """
        self.db = db
        self.user = user

    @trace_runtime
    async def extract(self, filename: str) -> CVProfile:
        """Extract a profile from the named file and persist it.

        Steps: look up the file record, short-circuit if a profile already
        exists for an extracted file, download the PDF, extract its text, run
        the AI extraction, insert the profile, flag the file as extracted, and
        commit.

        Args:
            filename: Name of the file to process, looked up for ``self.user``.

        Returns:
            The existing profile when the file was already extracted and a
            profile is found for it; otherwise the newly created profile.

        Raises:
            HTTPException: 404 when no file record is found, 400 when the PDF
                yields no text, 500 when the AI extraction returns nothing.
        """
        # 1. Fetch the file metadata
        file = await get_file_by_filename(
            self.db,
            filename=filename,
            user_id=self.user.user_id,
        )

        if not file:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=f"File '{filename}' not found",
            )

        if file.is_extracted:
            logger.info(f"ℹ️ File already extracted: {filename}")
            existing = await get_profile_by_filename(
                self.db,
                filename=filename,
                current_user=self.user,
            )
            if existing:
                return existing
            # Flag is set but no profile row was found: fall through and
            # extract again.

        # 2. Download the PDF
        pdf_bytes: bytes = await download_blob_by_filename(
            filename=file.filename,
            tenant_id=self.user.tenant_id,
            user_id=self.user.user_id,
        )

        # 3. Extract text
        text = await extract_text_from_pdf_bytes(pdf_bytes)

        # Guard against a None result as well as an empty/whitespace-only one,
        # so both cases produce the 400 below instead of an AttributeError.
        if not text or not text.strip():
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="PDF contains no extractable text",
            )

        # 4. Build the RawProfile
        raw_profile = RawProfile(
            filename=file.filename,
            content_type=file.file_type,
            content=text,
            profile_id=str(uuid4()),
        )

        # 5. Extract with AI
        ai_profile: AIProfile = await self._extract_with_ai(raw_profile)

        # _extract_with_ai returns an empty dict on any failure.
        if not ai_profile:
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail="AI extraction failed",
            )

        try:
            # 6. INSERT the profile into cv_profile
            profile = await create_profile(
                db=self.db,
                # user_id=self.user.user_id,
                # tenant_id=self.user.tenant_id,
                filename=file.filename,
                file_id=file.file_id,
                profile=ai_profile,
            )

            # 7. UPDATE cv_file.is_extracted
            await mark_file_extracted(
                db=self.db,
                file_id=file.file_id,
            )

            await self.db.commit()
        except Exception:
            # Discard the partial insert/update so the session is not left
            # holding a failed transaction, then let the original error
            # propagate unchanged.
            await self.db.rollback()
            raise

        logger.info(
            "✅ Profile extracted & persisted",
            extra={"context": {
                "filename": filename,
                "profile_id": profile.profile_id,
            }},
        )

        return profile

    @trace_runtime
    async def _extract_with_ai(self, raw_profile: RawProfile) -> Union[AIProfile, dict]:
        """Ask the LLM for a structured profile built from the CV content.

        Args:
            raw_profile: Raw CV record; its ``content`` is inserted into the
                prompt and its ``profile_id`` is used in error logs.

        Returns:
            The generator's result when it is truthy. An empty dict when the
            result is falsy or any exception occurs; errors are logged, not
            raised, so callers must check for a falsy return value.
        """
        # The literal below is runtime text sent to the model; .strip() removes
        # only the leading/trailing whitespace of the whole block.
        extract_one_profile_prompt = """
    You are an intelligent information extraction assistant.
    Your task is to read the following Curriculum Vitae (CV) text and extract structured information according to the expected output below.

    ----------------------------
    candidate's curriculum vitae:
    {cv}
    ----------------------------


    **Expected Output**:
    Follow the data schema from AIProfile


    **Instructions**:
    1. Read the provided CV and extract information needed based on expected output.
    2. Be Careful when to extract information for gpa_edu_1, univ_edu_1, major_edu_1 or gpa_edu_2, univ_edu_2, major_edu_2 or gpa_edu_3, univ_edu_3, major_edu_3
        ..._edu_1 is only for bachelor/undergraduate/sarjana degree
        ..._edu_2 is only for master/postgraduate degree
        ..._edu_3 is only for doctor/phd degree
    3. Reformat the extracted info using correct word spacing.
    4. Do not verbose, just return the final answer.
    """.strip()

        try:
            prompt = ChatPromptTemplate.from_template(extract_one_profile_prompt)
            # NOTE(review): `.get` assumes RawProfile is dict-like (e.g. a
            # TypedDict) — confirm against services.models.data_model.
            input_llm = {
                "cv": raw_profile.get("content")
            }
            # llm = model_4o_2.with_structured_output(AIProfile)
            llm = model_4o_2.with_structured_output(AIProfile)

            gen_ai = BaseAIGenerator(
                task_name="extract profile",
                prompt=prompt,
                input_llm=input_llm,
                llm=llm,
                metadata_observability=MetadataObservability(
                    fullname=self.user.full_name,
                    task_id=str(uuid4()),
                    agent=self.extract.__name__,
                    user_id=self.user.email,
                )
            )
            result = await gen_ai.agenerate()
            # result = asyncio.run(gen_ai.agenerate())

            if result:
                logger.info("✅ extract one profile success")
                return result

            logger.error(f"""❌ extract one profile failed, something went wrong for profile_id {raw_profile.get("profile_id")}""")
            return {}
        except Exception as E:
            # Best-effort by design: report the failure (with traceback) and
            # signal it to the caller through the empty return value.
            logger.error(
                f"""❌ extract one profile error for profile_id {raw_profile.get("profile_id")}, {E}""",
                exc_info=True,
            )
            return {}



    # @trace_runtime
    # async def extract_bulk_profile(self, raw_profiles: List[RawProfile]) -> Profiles:
    #     try:
    #         profiles = []
    #         failed_id = []
    #         for _, cv_text in enumerate(raw_profiles):
    #             profile = await self.extract(cv_text.get("content"))
    #             time.sleep(2)
    #             if profile:
    #                 logger.info(f"✅ extract bulk profile success [{_+1}/{len(raw_profiles)}]")
    #                 profiles.append(profile)
    #             else:
    #                 logger.info(f"""❌ extract bulk profile error for profile {cv_text.get("profile_id")}, [{_+1}/{len(raw_profiles)}]""")
    #                 profiles[cv_text.get("profile_id")] = profile
    #                 failed_id.append(cv_text.get("profile_id"))
            
    #         bulk_profile = Profiles(
    #             profiles=profiles
    #         )
    #         return bulk_profile
    #     except Exception as E:
    #         logger.error(f"❌ extract bulk profile error for profile_id {failed_id}, {E}")
    #         return Profiles(
    #             profiles=[]
    #         )