Spaces:

beautiful-code
/

ai_workflows

Runtime error

App Files Files Community

ai_workflows / app /workflows /courses /lessons_extractor.py

theRealNG

workflows(all): fix version spelling

5dc54c6 over 1 year ago

raw

history blame contribute delete

3.57 kB

	from langchain import callbacks
	from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
	from langchain_core.messages import SystemMessage, HumanMessage
	from langchain_core.output_parsers import JsonOutputParser
	from langchain_google_genai import ChatGoogleGenerativeAI
	from langchain_openai import ChatOpenAI
	from pydantic import BaseModel, Field
	from typing import List
	from workflows.tools.scrape_website import WebpageScreenshot
	from crewai import Agent, Task, Crew
	from crewai_tools import ScrapeWebsiteTool
	import base64
	import os
	import pprint


	class LessonsExtractor:
	    """Extract the lesson outline of a course from a screenshot of its webpage.

	    Call :meth:`kickoff` with the course URL. The page is captured through
	    ``WebpageScreenshot``, the capture is sent to an OpenAI chat model as an
	    image, and the reply is parsed as JSON using the ``Course`` schema.

	    The ``OPENAI_MODEL`` and ``ENV`` environment variables are read while
	    the chain is built; a missing variable raises ``KeyError``.
	    """

	    def kickoff(self, inputs=None):
	        """Run the extraction for a single course page.

	        Args:
	            inputs: Mapping that must contain the key ``"course_url"``.
	                ``None`` (the default) is treated as an empty mapping.

	        Returns:
	            A dict ``{"run_id": ..., "lessons": ...}``. ``run_id`` is the id
	            of the first run gathered by ``callbacks.collect_runs`` and
	            ``lessons`` is the ``"lessons"`` entry of the parsed model reply.

	        Raises:
	            KeyError: If ``"course_url"`` is absent from ``inputs``.
	        """
	        # A None sentinel replaces the former `{}` default so that no dict
	        # object is shared between calls.
	        if inputs is None:
	            inputs = {}
	        self.course_url = inputs["course_url"]
	        self._extract_lessons()

	        return {"run_id": self.run_id, "lessons": self.lessons}

	    def _extract_lessons(self):
	        """Capture the page, invoke the extractor chain, and store the results.

	        Sets ``course_webpage_content``, ``lessons`` and ``run_id`` on the
	        instance and prints progress information to stdout.
	        """
	        # The capture has to exist before the chain is built, because the
	        # prompt embeds it directly in the human message.
	        self.course_webpage_content = self._scrape_webpage_content()
	        extractor_chain = self._build_lessons_extractor_chain()
	        pprint.pp("Extracting Lessons....")
	        with callbacks.collect_runs() as cb:
	            self.lessons = extractor_chain.invoke(
	                {"screenshot": self.course_webpage_content})["lessons"]
	            # The first traced run is taken as the id of this invocation.
	            self.run_id = cb.traced_runs[0].id
	        print("Run ID: ", self.run_id)
	        print("Lessons")
	        pprint.pp(self.lessons)

	    def _scrape_webpage_content(self):
	        """Return what ``WebpageScreenshot`` produces for ``self.course_url``.

	        The value is later embedded in a ``data:image/png;base64,...`` URL,
	        so it is presumably a base64-encoded PNG (verify against
	        ``WebpageScreenshot``).
	        """
	        pprint.pp("Scraping Courses....")
	        webpage_content = WebpageScreenshot(self.course_url)
	        # Debug aid: to inspect the capture, base64-decode `webpage_content`
	        # and write the bytes to a .png file.

	        print("Webpage Content:")
	        pprint.pp(webpage_content)
	        return webpage_content

	    def _build_lessons_extractor_chain(self):
	        """Build the ``prompt | llm | parser`` chain that extracts lessons.

	        Requires ``self.course_webpage_content`` to be set already. The
	        messages are concrete message objects rather than templates, so the
	        screenshot is baked into the prompt here.

	        Returns:
	            The configured runnable; invoking it yields the parsed JSON
	            reply of the model.

	        Raises:
	            KeyError: If ``OPENAI_MODEL`` or ``ENV`` is not set.
	        """
	        course_parser = JsonOutputParser(pydantic_object=Course)
	        prompt = ChatPromptTemplate.from_messages([
	            SystemMessage(
	                "You are an expert in understanding a course webpage. "
	                "Your goal is to extract the course content that will be covered as part of the course from the screenshot of the course webpage. "
	                f"Formatting Instructions: {course_parser.get_format_instructions()}"
	            ),
	            HumanMessage(
	                content=[
	                    {"type": "text", "text": "Here is the course webpage screenshot"},
	                    {"type": "image_url", "image_url": {
	                        "url": f"data:image/png;base64,{self.course_webpage_content}",
	                        "detail": "auto",
	                    }}
	                ]
	            )
	        ])
	        llm = ChatOpenAI(model=os.environ['OPENAI_MODEL'], temperature=0.2)

	        # Tags, run name and metadata are attached to the traced run.
	        extractor_chain = (prompt | llm | course_parser).with_config({
	            "tags": ["courses"], "run_name": "Extracting Lessons",
	            "metadata": {
	                "version": "v1.0.0",
	                "growth_activity": "courses",
	                "env": os.environ["ENV"],
	                "model": os.environ["OPENAI_MODEL"],
	            }
	        })

	        return extractor_chain


	# Schema for a single lesson entry in the model's JSON reply.
	# Documented with `#` comments on purpose: pydantic copies a model's
	# docstring into its generated JSON schema, which would alter the format
	# instructions sent to the LLM.
	class Lesson(BaseModel):
	    # Lesson title as it appears on the page.
	    name: str = Field(description="Lesson name mentioned in the screenshot.")
	    # Concepts taught in the lesson. The fallback wording says "empty list"
	    # so that it agrees with the List[str] type of the field.
	    concepts: List[str] = Field(
	        description="What are the concepts mentioned in the screenshot "
	        "that the user will learn as part of this lesson. "
	        "If nothing is mentioned return an empty list."
	    )


	# Top-level schema given to JsonOutputParser: the model's reply is expected
	# to be a JSON object with a "lessons" list.
	# Kept free of a docstring so the JSON schema generated by pydantic (and
	# therefore the prompt's format instructions) stays unchanged.
	class Course(BaseModel):
	    # Lessons found on the course page.
	    lessons: List[Lesson]