File size: 3,566 Bytes
4ad7a82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5dc54c6
4ad7a82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from langchain import callbacks
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_core.output_parsers import JsonOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from typing import List
from workflows.tools.scrape_website import WebpageScreenshot
from crewai import Agent, Task, Crew
from crewai_tools import ScrapeWebsiteTool
import base64
import os
import pprint


class LessonsExtractor:
    """Extract the lessons of a course from a screenshot of its webpage.

    The workflow captures the course page via ``WebpageScreenshot``, sends the
    result to an OpenAI chat model as an image, and parses the model's JSON
    reply against the ``Course`` schema.
    """

    def kickoff(self, inputs=None):
        """Run the extraction workflow for one course.

        Args:
            inputs: Mapping that must contain the key ``"course_url"``.
                Defaults to an empty mapping when omitted.

        Returns:
            A dict with ``"run_id"`` (id of the first traced run) and
            ``"lessons"`` (the ``"lessons"`` entry of the parsed model output).

        Raises:
            KeyError: If ``"course_url"`` is missing from ``inputs``.
        """
        # Use a None sentinel instead of a shared mutable ``{}`` default.
        if inputs is None:
            inputs = {}
        self.course_url = inputs["course_url"]
        self._extract_lessons()

        return {"run_id": self.run_id, "lessons": self.lessons}

    def _extract_lessons(self):
        """Capture the page, run the extractor chain, and store the results.

        Sets ``self.course_webpage_content``, ``self.lessons`` and
        ``self.run_id``.
        """
        # The screenshot must be stored before the chain is built: the prompt
        # embeds ``self.course_webpage_content`` at construction time.
        self.course_webpage_content = self._scrape_webpage_content()
        extractor_chain = self._build_lessons_extractor_chain()
        pprint.pp("Extracting Lessons....")
        with callbacks.collect_runs() as cb:
            # NOTE(review): the prompt built below declares no template
            # variables, so this input dict appears to be unused by the
            # prompt itself — confirm before removing it.
            self.lessons = extractor_chain.invoke(
                {"screenshot": self.course_webpage_content})["lessons"]
            # NOTE(review): raises IndexError if no run was collected.
            self.run_id = cb.traced_runs[0].id
            print("Run ID: ", self.run_id)
        print("Lessons")
        pprint.pp(self.lessons)

    def _scrape_webpage_content(self):
        """Return the result of ``WebpageScreenshot`` for ``self.course_url``.

        The returned value is later interpolated into a
        ``data:image/png;base64,...`` URL, so it is presumably a base64-encoded
        PNG string — verify against ``WebpageScreenshot``.
        """
        pprint.pp("Scraping Courses....")
        webpage_content = WebpageScreenshot(self.course_url)
        # Debugging hint: to inspect the capture, base64-decode
        # ``webpage_content`` and write the bytes to a ``.png`` file.

        print("Webpage Content:")
        pprint.pp(webpage_content)
        return webpage_content

    def _build_lessons_extractor_chain(self):
        """Build the ``prompt | llm | parser`` chain that extracts lessons.

        Reads ``self.course_webpage_content`` and the ``OPENAI_MODEL`` and
        ``ENV`` environment variables (``KeyError`` if either is unset).

        Returns:
            The configured runnable chain, tagged with run metadata.
        """
        course_parser = JsonOutputParser(pydantic_object=Course)
        prompt = ChatPromptTemplate.from_messages([
            SystemMessage(
                "You are an expert in understanding a course webpage. "
                "Your goal is to extract the course content that will be covered as part of the course from the screenshot of the course webpage. "
                f"Formatting Instructions: {course_parser.get_format_instructions()}"
            ),
            HumanMessage(
                content=[
                    {"type": "text", "text": "Here is the course webpage screenshot"},
                    {"type": "image_url", "image_url": {
                        "url": f"data:image/png;base64,{self.course_webpage_content}",
                        "detail": "auto",
                    }}
                ]
            )
        ])
        llm = ChatOpenAI(model=os.environ['OPENAI_MODEL'], temperature=0.2)

        extractor_chain = (prompt | llm | course_parser).with_config({
            "tags": ["courses"], "run_name": "Extracting Lessons",
            "metadata": {
                "version": "v1.0.0",
                "growth_activity": "courses",
                "env": os.environ["ENV"],
                "model": os.environ["OPENAI_MODEL"],
            }
        })

        return extractor_chain


class Lesson(BaseModel):
    # Schema for one lesson extracted from the course screenshot. The field
    # descriptions below are part of the formatting instructions sent to the
    # model, so they are worded as instructions to the model.
    # NOTE(review): documented with `#` comments on purpose — a class
    # docstring would presumably be added to the generated JSON schema and
    # thereby change the prompt; confirm before converting to a docstring.

    # Lesson title as shown on the page.
    name: str = Field(description="Lesson name mentioned in the screenshot.")
    # Concepts taught in the lesson. The field is a list, so the "nothing
    # found" instruction must ask for an empty list, not an empty string.
    concepts: List[str] = Field(description="What are the concepts mentioned in the screenshot "
                                "that the user will learn as part of this lesson. "
                                "If nothing is mentioned return an empty list."
                                )


class Course(BaseModel):
    # Top-level schema passed to JsonOutputParser in
    # LessonsExtractor._build_lessons_extractor_chain; the extractor reads the
    # "lessons" key of the parsed model output.
    lessons: List[Lesson]