File size: 8,112 Bytes
fd58b95 329beda fd58b95 329beda fd58b95 329beda fd58b95 329beda fd58b95 329beda 55b7d0c fd58b95 329beda fd58b95 329beda fd58b95 329beda fd58b95 329beda fd58b95 329beda fd58b95 329beda fd58b95 329beda fd58b95 329beda fd58b95 329beda fd58b95 55b7d0c fd58b95 55b7d0c fd58b95 55b7d0c fd58b95 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 |
import asyncio
import json
import os
from pathlib import Path

import nest_asyncio
from dotenv import load_dotenv
from IPython.display import display, HTML
from llama_index.core import Settings
from llama_index.core import (
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage
)
from llama_index.core.agent import FunctionCallingAgent
from llama_index.core.base.base_query_engine import BaseQueryEngine
from llama_index.core.tools import FunctionTool
from llama_index.core.workflow import (
    StartEvent,
    StopEvent,
    Workflow,
    step,
    Event,
    Context
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.groq import Groq
from llama_index.utils.workflow import draw_all_possible_flows
from llama_parse import LlamaParse

from helper import extract_html_content
# Directory the vector index is persisted to and restored from.
storage_dir = "./storage"

# Patch asyncio so synchronous LlamaIndex calls can run inside an already
# running event loop (presumably needed for the workflow steps below).
nest_asyncio.apply()

# Pull credentials from a local .env file into the process environment
# before reading them; each value is None when the variable is unset.
load_dotenv()
llama_cloud_api_key = os.environ.get("LLAMA_CLOUD_API_KEY")
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
LLAMA_CLOUD_BASE_URL = os.environ.get("LLAMA_CLOUD_BASE_URL")

# Shared LLM and embedding model used by every index / query engine below.
global_llm = Groq(
    model="llama3-70b-8192",
    api_key=GROQ_API_KEY,
)
global_embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
)
# Register the embedding model as the library-wide default as well.
Settings.embed_model = global_embed_model
# Parse the resume PDF into markdown documents with LlamaParse.
documents = LlamaParse(
    api_key=llama_cloud_api_key,
    result_type="markdown",
    content_guideline_instruction=(
        "This is a resume, gather related facts together and format it as bullet points with headers"
    ),
).load_data("data/fake_resume.pdf")
print(documents[0].text)

# Embed the parsed documents and build an in-memory vector index.
index = VectorStoreIndex.from_documents(documents, embed_model=global_embed_model)

# Ask a sanity-check question against the fresh index.
query_engine = index.as_query_engine(
    llm=global_llm,
    similarity_top_k=5,
)
response = query_engine.query(
    "What is this person's name and what was their most recent job?"
)
print(response)

# Save the index so later runs can skip parsing and embedding.
index.storage_context.persist(persist_dir=storage_dir)
# Reload the persisted index from disk and repeat the sanity-check question
# to confirm the stored copy answers the same way as the in-memory one.
restored_index = None
# Check if the index is stored on disk
if os.path.exists(storage_dir):
    # Load the index from disk
    storage_context = StorageContext.from_defaults(persist_dir=storage_dir)
    restored_index = load_index_from_storage(storage_context)
else:
    print("Index not found on disk.")

# Only query when the index was actually restored; calling
# ``as_query_engine`` on None would raise AttributeError.
if restored_index is not None:
    print("\n\n Reading back the index \n")
    response = restored_index.as_query_engine(
        llm=global_llm,
        similarity_top_k=5,
    ).query("What is this person's name and what was their most recent job?")
    print(response)
print("\n\n" + "=" * 50, "\n\n")
def query_resume(q: str) -> str:
    """Answers questions about a specific resume."""
    # NOTE: the docstring above is handed to FunctionTool.from_defaults along
    # with this function, so it is kept verbatim.
    # Prefix the question so the LLM knows it refers to the indexed resume,
    # then run it through the module-level query engine built earlier.
    prompt = f"This is a question about the specific resume we have in our database: {q}"
    answer = query_engine.query(prompt)
    return answer.response
# Expose ``query_resume`` as a tool the agent can call.
resume_tool = FunctionTool.from_defaults(fn=query_resume)

# Build a function-calling agent whose only tool is the resume lookup.
agent = FunctionCallingAgent.from_tools(tools=[resume_tool], llm=global_llm, verbose=True)

# Ask one question through the agent and print its answer.
response = agent.chat(
    "How many years of experience does the applicant have?"
)
print(response)
print("\n\n" + "=" * 50, "\n\n")
# Emitted by RAGWorkflow.set_up and consumed by RAGWorkflow.parse_form.
class ParseFormEvent(Event):
    # Path of the application form to parse (passed on from the StartEvent).
    application_form: str
# One question to ask the resume query engine; RAGWorkflow.parse_form sends
# one of these per form field and RAGWorkflow.ask_question consumes them.
class QueryEvent(Event):
    # Natural-language question to run against the resume index.
    query: str
    # Form field the question is about. Declared explicitly because
    # parse_form passes ``field=`` and ask_question reads ``ev.field``.
    field: str
# Answer to a single QueryEvent; RAGWorkflow.fill_in_application collects
# one of these per form field before producing the final result.
class ResponseEvent(Event):
    # Form field this answer belongs to. Declared explicitly because
    # ask_question passes ``field=`` and fill_in_application reads ``r.field``.
    field: str
    # Text of the query engine's answer for that field.
    response: str
def _extract_form_fields(raw_text):
    """Return the list of form field names found in the LLM's JSON reply.

    The model is asked for bare JSON, but it may still wrap the object in
    markdown fences or add surrounding prose, so only the outermost
    ``{...}`` span of ``raw_text`` is decoded.

    Args:
        raw_text: Text returned by the LLM for the form-to-JSON prompt.

    Returns:
        A non-empty list of field names, each coerced to ``str``.

    Raises:
        ValueError: If no JSON object is present, it cannot be decoded
            (``json.JSONDecodeError`` is a ``ValueError``), or it does not
            contain a non-empty ``fields`` list.
    """
    start = raw_text.find("{")
    end = raw_text.rfind("}")
    if start == -1 or end < start:
        raise ValueError(f"No JSON object found in LLM output: {raw_text!r}")
    payload = json.loads(raw_text[start:end + 1])
    fields = payload.get("fields") if isinstance(payload, dict) else None
    if not isinstance(fields, list) or not fields:
        raise ValueError(
            f"LLM output does not contain a non-empty 'fields' list: {raw_text!r}"
        )
    return [str(field) for field in fields]


# the first step will be setup
class RAGWorkflow(Workflow):
    """Fill in a job application form using answers retrieved from a resume.

    Step chain: ``set_up`` -> ``parse_form`` -> ``ask_question`` (once per
    form field) -> ``fill_in_application``.

    ``run()`` must be given ``resume_file`` and ``application_form`` paths.
    """

    # Directory the resume index is persisted to / restored from.
    storage_dir = "./storage"
    # LLM used for form conversion, retrieval answers and the final summary.
    llm: Groq
    # Query engine over the resume index; created in ``set_up``.
    query_engine: BaseQueryEngine

    @step
    async def set_up(self, ctx: Context, ev: StartEvent) -> ParseFormEvent:
        """Validate the inputs and build (or restore) the resume query engine.

        Raises:
            ValueError: If ``resume_file`` or ``application_form`` is missing
                or empty.
        """
        self.llm = global_llm
        self.storage_dir = storage_dir

        # ``ev.get`` returns None for a missing key, so an omitted argument
        # reaches the ValueError below instead of failing on attribute access.
        resume_file = ev.get("resume_file")
        application_form = ev.get("application_form")
        if not resume_file:
            raise ValueError("No resume file provided")
        if not application_form:
            raise ValueError("No application form provided")

        # ingest the data and set up the query engine
        if os.path.exists(self.storage_dir):
            # you've already ingested the resume document
            # NOTE(review): any existing storage directory is reused as-is,
            # even if it was built from a different resume file.
            storage_context = StorageContext.from_defaults(persist_dir=self.storage_dir)
            index = load_index_from_storage(storage_context)
        else:
            # parse and load the resume document
            # (no api_key is passed here; presumably LlamaParse picks up the
            # credentials from the environment - confirm)
            documents = LlamaParse(
                result_type="markdown",
                content_guideline_instruction="This is a resume, gather related facts together and format it as "
                                              "bullet points with headers"
            ).load_data(resume_file)
            # embed and index the documents
            index = VectorStoreIndex.from_documents(
                documents,
                embed_model=global_embed_model
            )
            index.storage_context.persist(persist_dir=self.storage_dir)

        # create a query engine
        self.query_engine = index.as_query_engine(llm=self.llm, similarity_top_k=5)

        # the queries are generated from the form, so only the application
        # form needs to be passed on to the parsing step
        return ParseFormEvent(application_form=application_form)

    @step
    async def parse_form(self, ctx: Context, ev: ParseFormEvent) -> QueryEvent:
        """Parse the application form and fan out one QueryEvent per field.

        Raises:
            ValueError: If the LLM reply yields no usable list of fields.
        """
        parser = LlamaParse(
            result_type="markdown",
            content_guideline_instruction="This is a job application form. Create a list of all the fields that "
                                          "need to be filled in.",
            formatting_instruction="Return a bulleted list of the fields ONLY."
        )

        # get the LLM to convert the parsed form into JSON
        result = parser.load_data(ev.application_form)[0]
        raw_json = self.llm.complete(
            f"""
            This is a parsed form.
            Convert it into a JSON object containing only the list
            of fields to be filled in, in the form {{ fields: [...] }}.
            <form>{result.text}</form>.
            Return JSON ONLY, no markdown.
            """)
        fields = _extract_form_fields(raw_json.text)

        # store the number of fields first, so fill_in_application can never
        # look it up before it has been written
        await ctx.set("total_fields", len(fields))

        # generate one query for each of the fields, and fire them off
        for field in fields:
            ctx.send_event(QueryEvent(
                field=field,
                query=f"How would you answer this question about the candidate? {field}"
            ))
        # events were emitted via send_event, so nothing is returned here
        return None

    @step
    async def ask_question(self, ctx: Context, ev: QueryEvent) -> ResponseEvent:
        """Answer one field's question from the resume index."""
        response = self.query_engine.query(
            f"This is a question about the specific resume we have in our database: {ev.query}")
        # the declared ResponseEvent.response is a str, so substitute an
        # empty string if the engine returned no text
        return ResponseEvent(field=ev.field, response=response.response or "")

    @step
    async def fill_in_application(self, ctx: Context, ev: ResponseEvent) -> StopEvent:
        """Wait for every field's answer, then ask the LLM to combine them.

        Returns None until ``total_fields`` ResponseEvents have been
        collected; after that, returns a StopEvent carrying the LLM's
        completion.
        """
        # get the total number of fields to wait for
        total_fields = await ctx.get("total_fields")

        responses = ctx.collect_events(ev, [ResponseEvent] * total_fields)
        if responses is None:
            return None  # do nothing if there's nothing to do yet

        # we've got all the responses!
        response_list = "\n".join(
            f"Field: {r.field}\nResponse: {r.response}" for r in responses
        )

        result = self.llm.complete(f"""
            You are given a list of fields in an application form and responses to
            questions about those fields from a resume. Combine the two into a list of
            fields and succinct, factual answers to fill in those fields.
            <responses>
            {response_list}
            </responses>
        """)
        return StopEvent(result=result)
async def main():
    """Run the form-filling workflow once and render its step graph.

    Prints the workflow result, writes an HTML diagram of all possible
    flows under ``workflows/`` next to this file, and displays it.
    """
    w = RAGWorkflow(timeout=120, verbose=False)
    result = await w.run(
        resume_file="data/fake_resume.pdf",
        application_form="data/fake_application_form.pdf"
    )
    print(result)

    # Display of the workflow
    workflow_file = Path(__file__).parent / "workflows" / "form_parsing_workflow.html"
    # Make sure the output directory exists; the drawing helper presumably
    # does not create missing parent directories.
    workflow_file.parent.mkdir(parents=True, exist_ok=True)
    draw_all_possible_flows(w, filename=str(workflow_file))
    html_content = extract_html_content(str(workflow_file))
    display(HTML(html_content), metadata=dict(isolated=True))


if __name__ == "__main__":
    asyncio.run(main())
|