import base64
import os
import pprint
from typing import List

from langchain import callbacks
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_core.output_parsers import JsonOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from crewai import Agent, Task, Crew
from crewai_tools import ScrapeWebsiteTool

from workflows.tools.scrape_website import WebpageScreenshot


class LessonsExtractor:
    """Extract the lessons of a course from a screenshot of its webpage.

    ``kickoff`` is the entry point. It captures the page through
    ``WebpageScreenshot``, sends the capture to an OpenAI chat model together
    with JSON formatting instructions derived from ``Course``, and returns the
    parsed lessons along with the id of the traced run.
    """

    def kickoff(self, inputs=None) -> dict:
        """Run the extraction for the course at ``inputs["course_url"]``.

        Args:
            inputs: Mapping that must contain the key ``"course_url"``.
                ``None`` (the default) is treated as an empty mapping.

        Returns:
            A dict with the keys ``"run_id"`` and ``"lessons"``.

        Raises:
            KeyError: If ``"course_url"`` is missing from ``inputs``.
        """
        # ``None`` sentinel instead of a ``{}`` default, which would be a
        # single dict object shared by every call.
        if inputs is None:
            inputs = {}
        self.course_url = inputs["course_url"]
        self._extract_lessons()
        return {"run_id": self.run_id, "lessons": self.lessons}

    def _extract_lessons(self) -> None:
        """Capture the page, run the extractor chain and store the results.

        Sets ``self.course_webpage_content``, ``self.lessons`` and
        ``self.run_id``.
        """
        self.course_webpage_content = self._scrape_webpage_content()
        # The chain builder embeds ``self.course_webpage_content`` in the
        # prompt, so it has to run after the capture above.
        extractor_chain = self._build_lessons_extractor_chain()

        pprint.pp("Extracting Lessons....")
        with callbacks.collect_runs() as cb:
            # NOTE(review): the prompt is built from concrete message objects
            # and declares no "screenshot" variable, so this key looks unused
            # by the template -- confirm before relying on it.
            result = extractor_chain.invoke(
                {"screenshot": self.course_webpage_content}
            )
            self.lessons = result["lessons"]
            # The first collected run is taken as the run id for this call.
            self.run_id = cb.traced_runs[0].id
        print("Run ID: ", self.run_id)
        print("Lessons")
        pprint.pp(self.lessons)

    def _scrape_webpage_content(self):
        """Capture ``self.course_url`` with ``WebpageScreenshot`` and return it.

        The result is later interpolated into a ``data:image/png;base64,``
        URL, so it is presumably base64-encoded PNG data -- TODO confirm
        against ``WebpageScreenshot``.
        """
        pprint.pp("Scraping Courses....")
        webpage_content = WebpageScreenshot(self.course_url)
        # Debugging hint: to inspect the capture, base64-decode
        # ``webpage_content`` and write the bytes to a .png file.
        print("Webpage Content:")
        pprint.pp(webpage_content)
        return webpage_content

    def _build_lessons_extractor_chain(self):
        """Build the ``prompt | llm | parser`` chain that extracts lessons.

        Reads ``self.course_webpage_content`` (must already be set) and the
        environment variables ``OPENAI_MODEL`` and ``ENV``.

        Returns:
            The configured runnable chain.

        Raises:
            KeyError: If ``OPENAI_MODEL`` or ``ENV`` is not set.
        """
        course_parser = JsonOutputParser(pydantic_object=Course)

        system_message = SystemMessage(
            "You are an expert in understanding a course webpage. "
            "Your goal is to extract the course content that will be covered "
            "as part of the course from the screenshot of the course webpage. \n"
            f"Formatting Instructions: {course_parser.get_format_instructions()}"
        )
        # The screenshot is sent inline as a data URL image part.
        human_message = HumanMessage(
            content=[
                {"type": "text", "text": "Here is the course webpage screenshot"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{self.course_webpage_content}",
                        "detail": "auto",
                    },
                },
            ]
        )
        prompt = ChatPromptTemplate.from_messages([system_message, human_message])

        llm = ChatOpenAI(model=os.environ["OPENAI_MODEL"], temperature=0.2)

        # Tags, run name and metadata are attached to the chain's run config.
        extractor_chain = (prompt | llm | course_parser).with_config({
            "tags": ["courses"],
            "run_name": "Extracting Lessons",
            "metadata": {
                "version": "v1.0.0",
                "growth_activity": "courses",
                "env": os.environ["ENV"],
                "model": os.environ["OPENAI_MODEL"],
            },
        })
        return extractor_chain


class Lesson(BaseModel):
    """One lesson of a course as read from the webpage screenshot."""

    # Name of the lesson.
    name: str = Field(description="Lesson name mentioned in the screenshot.")
    # Concepts taught in the lesson. The field is a list, so the prompt asks
    # for an empty list (not an empty string) when nothing is mentioned.
    # NOTE(review): "screeshot" is kept verbatim because it is prompt text.
    concepts: List[str] = Field(
        description="What are the concepts mentioned in the screeshot "
                    "that the user will learn as part of this lesson. "
                    "If nothing is mentioned return an empty list."
    )


class Course(BaseModel):
    """Schema handed to the output parser: the lessons found for a course."""

    # All lessons extracted for the course.
    lessons: List[Lesson]