# NOTE: web file-viewer residue kept as a comment (not part of the module):
# "Spaces: Sleeping", file size 9,032 bytes, revision 478dec6.
import asyncio
# import os, glob
import uuid
import time
import pandas as pd
from dotenv import load_dotenv
load_dotenv()
from typing import AsyncIterable, List, Optional, Dict, Union
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.callbacks import AsyncCallbackHandler
# from fastapi.middleware.cors import CORSMiddleware
# from fastapi.responses import StreamingResponse
# from langchain.chat_models import ChatOpenAI
# from pydantic import BaseModel
# from langchain_google_genai import ChatGoogleGenerativeAI
from services.llms.LLM import model_5mini, model_4o_2
from services.agents.LLMAgent import LLMAgent
from models.data_model import OutProfile, Profile
from services.prompts.profile_extraction import extract_one_profile
from utils.utils import pdf_reader, ingest_one_profile, ingest_bulk_profile, retrieve_profile, pretty_profiles
# from concurrent.futures import ThreadPoolExecutor, as_completed
# Name of the autograder prompt template file, looked up under src/prompts/.
prompt_template_filename = "AutograderPrompt.md"

# Load the template once at import time. The file is read as bytes and decoded
# explicitly as UTF-8 (no newline translation), and the context manager
# guarantees the handle is closed instead of leaking it.
# NOTE(review): the path is relative, so it resolves against the current
# working directory of the process -- confirm this matches how the app is run.
with open(f"src/prompts/{prompt_template_filename}", "rb") as _prompt_file:
    prompt_autograder = _prompt_file.read().decode('utf-8')
class AutograderAgent(LLMAgent):
    """Agent that streams autograder answers and extracts profiles from CVs.

    The agent formats the autograder prompt template for streaming answers and
    runs a structured-output chain (``OutProfile``) over CV text returned by
    ``pdf_reader``. It also exposes thin async wrappers around the profile
    storage/retrieval helpers imported from ``utils.utils``.

    NOTE(review): ``self.model`` is used throughout but never assigned here;
    presumably ``LLMAgent.__init__`` stores the model -- confirm in the base
    class.
    """

    def __init__(self, model=model_4o_2):
        """Initialise the agent with ``model`` and the prompt templates it uses."""
        super().__init__(model)
        self.agent_name = "AutograderAgent"
        self.prompt_template = prompt_autograder
        self.prompt_extract_one_profile = extract_one_profile

    async def generate(self, user_profile: str) -> AsyncIterable[str]:
        """Stream the model's answer to the autograder prompt.

        Args:
            user_profile: Text substituted into the ``user_profile`` placeholder
                of the autograder prompt template.

        Yields:
            The ``content`` of every chunk produced by ``self.model.astream``.

        Errors raised while streaming are printed and end the stream instead of
        propagating (best-effort behaviour, intentionally preserved).
        """
        self.callback = AsyncCallbackHandler()
        self.callbacks = [self.callback]
        # Named ``messages`` rather than ``input`` to avoid shadowing the builtin.
        messages = [
            HumanMessage(content=self.prompt_template.format(user_profile=user_profile)),
        ]
        try:
            async for token in self.model.astream(input=messages, callbacks=self.callbacks):
                yield token.content
        except Exception as e:
            print(f"Caught exception: {e}")

    async def _extract_profile(self, file_path: str) -> OutProfile:
        """Read one CV with ``pdf_reader`` and run the extraction chain on it."""
        cv = await pdf_reader(file_path)
        llm = self.model.with_structured_output(OutProfile)
        chain = self.prompt_extract_one_profile | llm
        return await chain.ainvoke({"cv": cv}, config=None)

    async def generate_one(self, file_path: str) -> Optional[OutProfile]:
        """Generate the extracted profile from one CV (curriculum vitae).

        Args:
            file_path: Location of the CV, passed unchanged to ``pdf_reader``.

        Returns:
            The result of the structured-output chain for this CV.

        Raises:
            NotImplementedError: If reading the CV or invoking the chain fails.
                The exception type is kept for backward compatibility; the
                original error is attached as ``__cause__``.
        """
        try:
            return await self._extract_profile(file_path)
        except Exception as E:
            print(f"Failed to generate one profile for {file_path} due to error, {E}")
            raise NotImplementedError(
                f"Failed to generate one profile for {file_path} due to error, {E}"
            ) from E

    # NOTE: earlier commented-out variants of generate_bulk (sequential,
    # one-await-per-file loops over a folder glob or a list of PDFs) were
    # removed here; the concurrent implementation below supersedes them.

    async def _helper_generate_one(self, file_path):
        """Extract one profile and print how long the extraction took."""
        st = time.time()
        profile = await self._extract_profile(file_path)
        rt = time.time() - st
        print(f"Runtime extract one profile: {round(rt,2)}")
        return profile

    async def _bulk_worker(self, position: int, total: int, file_path):
        """Extract one profile for ``generate_bulk`` and report its completion.

        A failure is re-raised as ``NotImplementedError`` naming the file that
        actually failed, so the bulk caller does not have to guess it.
        """
        try:
            profile = await self._helper_generate_one(file_path)
        except Exception as E:
            raise NotImplementedError(
                f"Failed to generate one profile for {file_path} due to error, {E}"
            ) from E
        # Printed only once the extraction has really finished.
        print(f"[{position}/{total}] profile extracted ✅")
        return profile

    async def generate_bulk(self, pdfs: List, export_csv: bool = False) -> Optional[List[OutProfile]]:
        """Generate extracted profiles from several CVs concurrently.

        Args:
            pdfs: CV locations; each item is passed unchanged to ``pdf_reader``.
            export_csv: Currently unused; kept for interface compatibility.

        Returns:
            One extracted profile per entry of ``pdfs``, in the same order
            (``asyncio.gather`` preserves input order). Empty input gives ``[]``.

        Raises:
            NotImplementedError: If any extraction fails. The message names the
                failing file and the original error is chained as ``__cause__``.
        """
        st = time.time()
        n_files = len(pdfs)
        tasks = []
        for i, file_path in enumerate(pdfs, start=1):
            print(f"Reading file [{i}/{n_files}]")
            tasks.append(asyncio.create_task(self._bulk_worker(i, n_files, file_path)))
        try:
            profiles = await asyncio.gather(*tasks)
        except NotImplementedError as E:
            # The results are discarded on failure, so stop the remaining
            # extractions instead of letting them run on in the background.
            for task in tasks:
                task.cancel()
            print(E)
            raise
        # Measure once so minutes and seconds come from the same reading.
        minutes, seconds = divmod(time.time() - st, 60)
        print(f"✅ Finish in {minutes} min, {seconds} sec")
        return profiles

    async def insert_one_profile(self, profile: Profile):
        """Pass a single profile to ``ingest_one_profile``."""
        await ingest_one_profile(profile)

    async def insert_bulk_profile(self, profiles: List[Profile]):
        """Pass a list of profiles to ``ingest_bulk_profile``."""
        await ingest_bulk_profile(profiles)

    async def get_profiles(self, criteria: str, limit: int):
        """Return the result of ``retrieve_profile`` for ``criteria``.

        Args:
            criteria: Forwarded as ``input_user``.
            limit: Forwarded as ``limit`` (presumably a cap on the number of
                results -- confirm in ``utils.utils``).
        """
        retrieved_profiles = await retrieve_profile(input_user=criteria, limit=limit)
        return retrieved_profiles

    async def get_dataframe_profiles(self, profiles: List[Profile]) -> pd.DataFrame:
        """Return the result of ``pretty_profiles`` for the given profiles."""
        df = await pretty_profiles(profiles)
        return df
# import asyncio
# # myagent = AutograderAgent(model=model_gemini)
# myagent2 = AutograderAgent(model=model_4o)
# folder_path="src/data/cvs"
# files_path = glob.glob(f"{folder_path}/*.pdf")
# print(len(files_path))
# # res = asyncio.run(myagent.generate_one(file_path=files_path[1]))
# res_bulk = asyncio.run(myagent2.generate_bulk(folder_path=folder_path))
# profiles = asyncio.run(helper_prepare_profiles(files_path, res_bulk))
# asyncio.run(myagent2.insert_bulk_profile(profiles))