Spaces:
Runtime error
Runtime error
chains
Browse files- chains/code_generator/base.py +19 -0
- chains/code_generator/templates.py +55 -0
- chains/output_format/base.py +19 -0
- chains/output_format/templates.py +28 -0
chains/code_generator/base.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain.chains import LLMChain
|
| 2 |
+
from langchain.memory import ConversationBufferMemory
|
| 3 |
+
from chains.code_generator.templates import chat_script_prompt
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def chain_code_generator(llm) -> LLMChain:
    """Build the LLMChain that produces the scraping script.

    Args:
        llm: Any LangChain-compatible language model instance.

    Returns:
        An LLMChain writing its result under the ``"script"`` output key,
        with conversation history stored as ``"chat_history"`` keyed on the
        ``"output_format"`` input.
    """
    # Buffer memory so successive generations can see earlier turns.
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        input_key="output_format",
    )

    chain = LLMChain(
        llm=llm,
        prompt=chat_script_prompt,
        output_key="script",
        memory=memory,
        verbose=True,
    )
    return chain
|
chains/code_generator/templates.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain.prompts import (
|
| 2 |
+
SystemMessagePromptTemplate,
|
| 3 |
+
HumanMessagePromptTemplate,
|
| 4 |
+
ChatPromptTemplate,
|
| 5 |
+
PromptTemplate,
|
| 6 |
+
)
|
| 7 |
+
|
| 8 |
+
# Prompt templates
# NOTE: fixed misspellings in the prompt text ("assitant" -> "assistant",
# "scrapping" -> "scraping", "aditional" -> "additional", "de" -> "the") so the
# instructions read correctly to the model.
system_template_script = PromptTemplate(
    input_variables=["output_format", "html_content"],
    template="""You are a helpful assistant that helps people create python scripts for web scraping.
--------------------------------
The example of the html content is: {html_content}
--------------------------------
You have to create a python function that extract information from an html code using web scraping.
Try to select the most low-level class that is common among the elements to make the find_all function.

Your answer SHOULD only contain the python function code without any additional word or character.

Import the used libraries above the function definition.

The function name must be extract_info.

The function have to receive the html data as a parameter.

Your function needs to extract information for all the elements with similar attributes.

Before calling .text or ['href'] methods, check if the element exists.

----------------
FINAL ANSWER EXAMPLE:
from bs4 import BeautifulSoup

def extract_info(html):
    ...CODE...
    return {output_format}
----------------

Always check if the element exists before calling some method.

""",
)

# Human turn carries no variables; it only triggers the generation.
human_template_script = PromptTemplate(input_variables=[], template="give me the code")

# Chat Prompt objects
system_template_script_prompt = SystemMessagePromptTemplate.from_template(
    system_template_script.template
)
human_template_script_prompt = HumanMessagePromptTemplate.from_template(
    human_template_script.template
)
chat_script_prompt = ChatPromptTemplate.from_messages(
    [system_template_script_prompt, human_template_script_prompt]
)
|
chains/output_format/base.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain.chains import LLMChain
|
| 2 |
+
from langchain.memory import ConversationBufferMemory
|
| 3 |
+
from chains.output_format.templates import output_format_chat_prompt
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def chain_output_format(llm) -> LLMChain:
    """Build the LLMChain that derives a JSON output format from HTML.

    Args:
        llm: Any LangChain-compatible language model instance.

    Returns:
        An LLMChain writing its result under the ``"output_format"`` output
        key, with conversation history stored as ``"chat_history"`` keyed on
        the ``"html_content"`` input.
    """
    # Buffer memory keyed on the incoming HTML so follow-up calls share context.
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        input_key="html_content",
    )

    chain = LLMChain(
        llm=llm,
        prompt=output_format_chat_prompt,
        output_key="output_format",
        memory=memory,
        verbose=True,
    )
    return chain
|
chains/output_format/templates.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate, PromptTemplate
|
| 2 |
+
|
| 3 |
+
# prompt templates
# NOTE: fixed misspellings in the prompt text ("assitant" -> "assistant",
# "aditional" -> "additional") and normalized keyword-argument spacing (PEP 8).
system_template_output_format = PromptTemplate(
    input_variables=['html_content'],
    template='''You are a helpful assistant that helps people extract JSON information from HTML content.

The input is a HTML content.

The expected output is a JSON with a relevant information in the following html: {html_content}

Try to extract as much information as possible. Including images, links, etc.

The assistant answer should ONLY contain the JSON information without any additional word or character.

The expected output format is an array of objects.

''')

# Human turn repeats the HTML so it appears in the conversation itself.
human_template_output_format = PromptTemplate(
    input_variables=['html_content'],
    template='this is the html content: {html_content}'
)

# chat prompts objects
system_message_prompt = SystemMessagePromptTemplate.from_template(system_template_output_format.template)
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template_output_format.template)
output_format_chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
|