Spaces:
Runtime error
Runtime error
trace change
Browse files
app.py
CHANGED
|
@@ -15,18 +15,27 @@ os.environ["LANGCHAIN_ENDPOINT"]="https://api.smith.langchain.com"
|
|
| 15 |
os.environ["LANGCHAIN_API_KEY"]=st.secrets["LANGCHAIN_API_KEY"]
|
| 16 |
os.environ["LANGCHAIN_PROJECT"]=st.secrets["LANGCHAIN_PROJECT"]
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
if assistant_api_key == '':
|
| 32 |
assistant_api_key = st.secrets["API_KEY"]
|
|
@@ -36,17 +45,23 @@ else:
|
|
| 36 |
gpt_assistant = GPTAssistant(assistant_api_key)
|
| 37 |
|
| 38 |
|
| 39 |
-
html_content =
|
| 40 |
# check if html_content is an url, and show error if it is
|
| 41 |
-
if html_content:
|
| 42 |
-
if html_content.startswith("http"):
|
| 43 |
-
st.write("Please paste the HTML piece code, not the URL")
|
| 44 |
-
html_content = None
|
| 45 |
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
if html_content and extract_button:
|
| 49 |
try:
|
|
|
|
| 50 |
output = gpt_assistant.chain_response_format(html_content)
|
| 51 |
st.session_state['output_format'] = output
|
| 52 |
except NameError:
|
|
@@ -57,16 +72,16 @@ if html_content and extract_button:
|
|
| 57 |
if 'output_format' in st.session_state:
|
| 58 |
output_format = st.code(st.session_state['output_format'], language="json")
|
| 59 |
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
|
| 71 |
@traceable(run_type="tool")
|
| 72 |
def test_the_code(code, full_content):
|
|
@@ -89,5 +104,4 @@ if 'code_generated' in st.session_state:
|
|
| 89 |
if full_content and test_code:
|
| 90 |
html_data = full_content
|
| 91 |
result = None
|
| 92 |
-
test_the_code(st.session_state['code_generated_exec'], full_content=full_content)
|
| 93 |
-
|
|
|
|
| 15 |
os.environ["LANGCHAIN_API_KEY"]=st.secrets["LANGCHAIN_API_KEY"]
|
| 16 |
os.environ["LANGCHAIN_PROJECT"]=st.secrets["LANGCHAIN_PROJECT"]
|
| 17 |
|
| 18 |
+
@traceable(run_type="tool")
|
| 19 |
+
def start(run=False):
|
| 20 |
+
st.write("This app helps you to extract data from HTML code using web scraping. It uses GPT-3.5-turbo to generate the code for you. \n *Contribute to this project on [GitHub](https://github.com/CognitiveLabs/GPT-auto-webscraping)*")
|
| 21 |
+
|
| 22 |
+
with st.expander(label="Check out the video demo"):
|
| 23 |
+
yt_video = st.video("https://www.youtube.com/watch?v=_zeCun4OlCc")
|
| 24 |
+
|
| 25 |
+
info_text = """
|
| 26 |
+
**Quick start** \n
|
| 27 |
+
Fill the input with <HTML code>.
|
| 28 |
+
* Choose a repeating element on the page, like a product on a list.
|
| 29 |
+
* Inspect the HTML code and copy the element.
|
| 30 |
+
|
| 31 |
+
After generating the "output format" and the code, paste the complete HTML code of the page in the last input to test it
|
| 32 |
+
"""
|
| 33 |
+
st.write(info_text)
|
| 34 |
+
st.image("https://j.gifs.com/gpqvPl.gif")
|
| 35 |
+
if run:
|
| 36 |
+
return True
|
| 37 |
+
# use time library
|
| 38 |
+
start(run=True)
|
| 39 |
|
| 40 |
if assistant_api_key == '':
|
| 41 |
assistant_api_key = st.secrets["API_KEY"]
|
|
|
|
| 45 |
gpt_assistant = GPTAssistant(assistant_api_key)
|
| 46 |
|
| 47 |
|
| 48 |
+
html_content = None
|
| 49 |
# check if html_content is an url, and show error if it is
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
+
def html_content_input():
    """Render the HTML snippet input and the "generate" button.

    The pasted text is stored in the module-level ``html_content`` so the
    top-level code that runs after this call can read it. If the text
    starts with ``"http"`` it is treated as a URL: a hint is shown and
    ``html_content`` is reset to ``None``.

    Returns:
        The value returned by ``st.button`` for the
        "Generate output format & code" button.
    """
    # Without this declaration the assignments below only bind a
    # function-local name; the module-level ``html_content`` would stay
    # ``None`` and the ``if html_content and extract_button`` check that
    # follows this call could never succeed.
    global html_content

    html_content = st.text_input(
        "Paste the HTML tags of the item you want to extract:",
        max_chars=10000,
        help="example: <li>Product 1 </li>, watch the video above",
    )

    # A URL is not usable here: the input expects the markup itself.
    if html_content and html_content.startswith("http"):
        st.write("Please paste the HTML piece code, not the URL")
        html_content = None

    return st.button("Generate output format & code")
|
| 59 |
+
|
| 60 |
+
extract_button = html_content_input()
|
| 61 |
|
| 62 |
if html_content and extract_button:
|
| 63 |
try:
|
| 64 |
+
st.write("1/2: Generating the output format...")
|
| 65 |
output = gpt_assistant.chain_response_format(html_content)
|
| 66 |
st.session_state['output_format'] = output
|
| 67 |
except NameError:
|
|
|
|
| 72 |
if 'output_format' in st.session_state:
|
| 73 |
output_format = st.code(st.session_state['output_format'], language="json")
|
| 74 |
|
| 75 |
+
try:
|
| 76 |
+
st.write("2/2: Generating the code...")
|
| 77 |
+
python_code = gpt_assistant.chain_code_generator(st.session_state['output_format'], html_content)
|
| 78 |
+
st.session_state['code_generated'] = python_code
|
| 79 |
+
st.session_state['code_generated_exec'] = python_code + "\nresult = extract_info(html_data)"
|
| 80 |
|
| 81 |
+
except NameError:
|
| 82 |
+
st.write("Complete the API key field")
|
| 83 |
+
except AuthenticationError:
|
| 84 |
+
st.write("Invalid API key")
|
| 85 |
|
| 86 |
@traceable(run_type="tool")
|
| 87 |
def test_the_code(code, full_content):
|
|
|
|
| 104 |
if full_content and test_code:
|
| 105 |
html_data = full_content
|
| 106 |
result = None
|
| 107 |
+
test_the_code(st.session_state['code_generated_exec'], full_content=full_content)
|
|
|