# File: ai_workflows/app/workflows/courses/lessons_extractor.py
# Last commit by theRealNG: "workflows(all): fix version spelling" (5dc54c6)
from langchain import callbacks
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_core.output_parsers import JsonOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from typing import List
from workflows.tools.scrape_website import WebpageScreenshot
from crewai import Agent, Task, Crew
from crewai_tools import ScrapeWebsiteTool
import base64
import os
import pprint
class LessonsExtractor:
    """Extract the lessons of a course from a screenshot of its webpage.

    The workflow captures the course page with ``WebpageScreenshot``, embeds
    the capture as an image in a chat prompt for an OpenAI model, and parses
    the model's JSON reply to obtain the list of lessons.
    """

    def kickoff(self, inputs=None):
        """Run the extraction for one course page.

        Args:
            inputs: Mapping that must contain ``"course_url"``. ``None``
                (the default) is treated as an empty mapping.

        Returns:
            A dict with ``"run_id"`` (id of the first traced run) and
            ``"lessons"`` (the ``"lessons"`` entry of the parsed model output).

        Raises:
            KeyError: If ``"course_url"`` is missing from ``inputs``.
        """
        # A ``{}`` default is a single dict shared by every call; use a
        # ``None`` sentinel and create a fresh mapping per call instead.
        if inputs is None:
            inputs = {}
        self.course_url = inputs["course_url"]
        self._extract_lessons()
        return {"run_id": self.run_id, "lessons": self.lessons}

    def _extract_lessons(self):
        """Scrape the page, invoke the chain, and store the lessons and run id.

        Sets ``self.course_webpage_content``, ``self.lessons`` and
        ``self.run_id`` as side effects.
        """
        # The chain builder reads self.course_webpage_content, so the scrape
        # has to happen before the chain is built.
        self.course_webpage_content = self._scrape_webpage_content()
        extractor_chain = self._build_lessons_extractor_chain()

        pprint.pp("Extracting Lessons....")
        # collect_runs() records the traced runs so the id of this invocation
        # can be handed back to the caller.
        with callbacks.collect_runs() as cb:
            parsed_output = extractor_chain.invoke(
                {"screenshot": self.course_webpage_content}
            )
            self.lessons = parsed_output["lessons"]
            self.run_id = cb.traced_runs[0].id
            print("Run ID: ", self.run_id)

        print("Lessons")
        pprint.pp(self.lessons)

    def _scrape_webpage_content(self):
        """Capture the course page and return whatever ``WebpageScreenshot`` yields.

        NOTE(review): the result is later embedded in a
        ``data:image/png;base64,`` URL, so it is presumably a base64-encoded
        PNG string -- confirm against ``WebpageScreenshot``.
        """
        pprint.pp("Scraping Courses....")
        webpage_content = WebpageScreenshot(self.course_url)

        # Testing the screenshot taken
        # image_data = base64.b64decode(webpage_content)
        # with open("webpageScreenshot.png", "wb") as fh:
        #     fh.write(image_data)

        print("Webpage Content:")
        pprint.pp(webpage_content)

        return webpage_content

    def _build_lessons_extractor_chain(self):
        """Build the ``prompt | llm | parser`` chain used to extract lessons.

        Reads ``self.course_webpage_content`` and the ``OPENAI_MODEL`` and
        ``ENV`` environment variables (``KeyError`` if either is unset).

        Returns:
            The configured chain, tagged ``"courses"`` and named
            ``"Extracting Lessons"``.
        """
        course_parser = JsonOutputParser(pydantic_object=Course)

        # Both messages are concrete message objects rather than templates:
        # the format instructions and the screenshot are baked in right here.
        system_message = SystemMessage(
            "You are an expert in understanding a course webpage. "
            "Your goal is to extract the course content that will be covered as part of the course from the screenshot of the course webpage. "
            f"Formatting Instructions: {course_parser.get_format_instructions()}"
        )
        screenshot_message = HumanMessage(
            content=[
                {"type": "text", "text": "Here is the course webpage screenshot"},
                {"type": "image_url", "image_url": {
                    "url": f"data:image/png;base64,{self.course_webpage_content}",
                    "detail": "auto",
                }},
            ]
        )
        prompt = ChatPromptTemplate.from_messages(
            [system_message, screenshot_message]
        )

        llm = ChatOpenAI(model=os.environ['OPENAI_MODEL'], temperature=0.2)

        extractor_chain = (prompt | llm | course_parser).with_config({
            "tags": ["courses"], "run_name": "Extracting Lessons",
            "metadata": {
                "version": "v1.0.0",
                "growth_activity": "courses",
                "env": os.environ["ENV"],
                "model": os.environ["OPENAI_MODEL"],
            }
        })

        return extractor_chain
# Schema for one lesson. It is nested in Course, whose schema is rendered into
# the formatting instructions of the extraction prompt.
# NOTE: documented with comments rather than a docstring because pydantic
# copies a model docstring into the JSON schema description, which would
# change the text sent to the model.
class Lesson(BaseModel):
    # Lesson title as it appears on the page.
    name: str = Field(description="Lesson name mentioned in the screenshot.")
    # The field is a list, so the "nothing found" instruction has to ask for
    # an empty list; asking for an empty string contradicts the declared type.
    concepts: List[str] = Field(
        description="What are the concepts mentioned in the screeshot "
        "that the user will learn as part of this lesson. "
        "If nothing is mentioned return an empty list."
    )
# Top-level schema passed to JsonOutputParser as its pydantic_object; the
# parsed model output is read through its "lessons" key.
# NOTE: documented with comments rather than a docstring because pydantic
# copies a model docstring into the JSON schema description, which would
# change the formatting instructions sent to the model.
class Course(BaseModel):
    # Lessons extracted from the course page.
    lessons: List[Lesson]